├── examples ├── __init__.py ├── saved_logs │ └── example_test_tube_data │ │ ├── demo_test_0 │ │ ├── version_0 │ │ │ ├── meta_tags.json │ │ │ ├── media │ │ │ │ └── jpg_0.jpg │ │ │ └── metrics.csv │ │ └── version_1 │ │ │ ├── meta_tags.json │ │ │ ├── media │ │ │ └── jpg_0.jpg │ │ │ └── metrics.csv │ │ └── demo_test_1 │ │ ├── version_0 │ │ ├── meta_tags.json │ │ ├── media │ │ │ └── jpg_0.jpg │ │ └── metrics.csv │ │ └── version_1 │ │ ├── meta_tags.json │ │ ├── media │ │ └── jpg_0.jpg │ │ └── metrics.csv ├── tensorflow_example.py ├── pytorch_hpc_example.py └── hpc_cpu_example.py ├── test_tube ├── hyper_opt_utils │ ├── __init__.py │ └── strategies.py ├── .DS_Store ├── __init__.py ├── hyperopt.py ├── argparse_hopt.py ├── hpc.py └── log.py ├── .DS_Store ├── imgs ├── viz_a.png └── test_tube_logo.png ├── docs ├── img │ └── viz_a.png ├── index.md ├── experiment_tracking │ └── experiment.md ├── hyperparameter_optimization │ └── HyperOptArgumentParser.md └── hpc │ └── SlurmCluster.md ├── site ├── img │ ├── viz_a.png │ └── favicon.ico ├── sitemap.xml.gz ├── fonts │ ├── fontawesome-webfont.eot │ ├── fontawesome-webfont.ttf │ └── fontawesome-webfont.woff ├── sitemap.xml ├── search │ ├── main.js │ └── worker.js ├── js │ ├── theme.js │ └── modernizr-2.8.3.min.js ├── css │ └── theme_extra.css ├── 404.html ├── search.html ├── index.html ├── experiment_tracking │ └── experiment │ │ └── index.html ├── hyperparameter_optimization │ └── HyperOptArgumentParser │ │ └── index.html └── hpc │ └── SlurmCluster │ └── index.html ├── requirements.txt ├── tests ├── log_test.py ├── argparse_hopt_test.py ├── hpc_test.py └── strategies_test.py ├── update.sh ├── mkdocs.yml ├── setup.cfg ├── .travis.yml ├── LICENSE ├── setup.py ├── .gitignore └── README.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_tube/hyper_opt_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/.DS_Store -------------------------------------------------------------------------------- /imgs/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/viz_a.png -------------------------------------------------------------------------------- /docs/img/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/docs/img/viz_a.png -------------------------------------------------------------------------------- /site/img/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/viz_a.png -------------------------------------------------------------------------------- /site/sitemap.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/sitemap.xml.gz -------------------------------------------------------------------------------- /test_tube/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/test_tube/.DS_Store -------------------------------------------------------------------------------- /site/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/favicon.ico -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_b": "s", "tag_a": 2} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_a": 2, "tag_b": "s"} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_b": "s", "tag_a": 2} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_a": 2, "tag_b": "s"} -------------------------------------------------------------------------------- /imgs/test_tube_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/test_tube_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.20.3 2 | numpy>=1.13.3 3 | imageio>=2.3.0 4 | tensorboard>=1.15.0 5 | torch>=1.1.0 6 | future -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /tests/log_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_hello(): 5 | assert 4==4 6 | 7 | if __name__ == '__main__': 8 | pytest.main([__file__]) 9 | -------------------------------------------------------------------------------- /tests/argparse_hopt_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_hello(): 5 | assert 4==4 6 | 7 | if __name__ == '__main__': 8 | pytest.main([__file__]) 9 | 
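The two test modules above are placeholder stubs (`assert 4==4`). For illustration only, a fuller `argparse_hopt` test could drive `HyperOptArgumentParser` the same way `tests/hpc_test.py` and the docs do; the sketch below assumes `trials()` yields namespace-like trial objects as described in `docs/hyperparameter_optimization/HyperOptArgumentParser.md` and is not part of the repository's test suite:

```python
import pytest

from test_tube import HyperOptArgumentParser


def test_grid_search_trials():
    # Mirror the parser usage from the docs and tests/hpc_test.py.
    parser = HyperOptArgumentParser(strategy='grid_search')
    parser.opt_list('--nb_layers', default=2, type=int,
                    tunable=True, options=[2, 4, 8])
    hparams = parser.parse_args()

    # Defaults are untouched until a hyperparameter search runs.
    assert hparams.nb_layers == 2

    # Each generated trial should carry one of the listed option values.
    trials = list(hparams.trials(3))
    assert len(trials) == 3
    assert all(trial.nb_layers in (2, 4, 8) for trial in trials)


if __name__ == '__main__':
    pytest.main([__file__])
```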
-------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg -------------------------------------------------------------------------------- /test_tube/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Experiment logger module 3 | """ 4 | 5 | from .argparse_hopt import HyperOptArgumentParser 6 | from .hpc import SlurmCluster 7 | from .hyperopt import HyperParamOptimizer 8 | from .log import Experiment 9 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:28.005016,/Users/waf/test_tube_data/demo_test_0/version_0/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:28.005031,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:37.395603,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_0/version_1/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:37.395635,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:28.035057,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_0/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:28.035086,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/metrics.csv: -------------------------------------------------------------------------------- 1 | 
created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:37.443175,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_1/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:37.443252,,3,2 4 | -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version=$1 4 | 5 | git commit -am "release v$version" 6 | git tag $version -m "test_tube v$version" 7 | git push --tags origin master 8 | 9 | # push to pypi 10 | rm -rf ./dist/* 11 | python3 setup.py sdist 12 | twine upload dist/* 13 | 14 | 15 | 16 | # to update docs 17 | # cd to root dir 18 | # mkdocs gh-deploy 19 | 20 | 21 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Test tube Documentation 2 | theme: readthedocs 3 | docs_dir: docs 4 | repo_url: https://github.com/williamFalcon/test_tube 5 | site_dir: 'site' 6 | site_description: 'Documentation for Test Tube, the Python Deep Learning and Machine Learning experiment tracking and tuning framework.' 7 | 8 | dev_addr: '0.0.0.0:8000' 9 | #google_analytics: ['UA-aasd', 'sitename'] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [yapf] 5 | align_closing_bracket_with_visual_indent = True 6 | # Put braces on their own line. 7 | dedent_closing_brackets = True 8 | split_before_closing_bracket = True 9 | indent_width = 4 10 | coalesce_brackets = True 11 | allow_multiline_lambdas = True 12 | join_multiple_lines = True 13 | spaces_around_power_operator = False 14 | column_limit = 100 15 | -------------------------------------------------------------------------------- /site/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | None 5 | 2019-08-03 6 | daily 7 | 8 | 9 | None 10 | 2019-08-03 11 | daily 12 | 13 | 14 | None 15 | 2019-08-03 16 | daily 17 | 18 | 19 | None 20 | 2019-08-03 21 | daily 22 | 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # command to install dependencies 3 | cache: pip 4 | 5 | matrix: 6 | include: 7 | - python: 3.6 8 | dist: xenial # Ubuntu 16.04 9 | env: 10 | - MIN_REQUIREMENTS=1 11 | - python: 3.6 12 | dist: bionic # Ubuntu 18.04 13 | - python: 3.7 14 | dist: bionic # Ubuntu 18.04 15 | 16 | install: 17 | - if [[ "${MIN_REQUIREMENTS}" == "1" ]]; then 18 | python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements-ci.txt', 'w').write(req)" ; 19 | pip install -r requirements-ci.txt ; 20 | fi 21 | - pip install -e . 
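# (The MIN_REQUIREMENTS build above rewrites '>' to '=' in requirements.txt, so the oldest pinned dependency versions are the ones installed and tested.)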
22 | 23 | 24 | # command to run tests 25 | script: 26 | - pytest # or py.test for Python versions 3.5 and below 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /tests/hpc_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from test_tube.argparse_hopt import HyperOptArgumentParser 4 | from test_tube.hpc import SlurmCluster 5 | 6 | 7 | def test_slurm_time_to_seconds(): 8 | parser = HyperOptArgumentParser() 9 | parsed = parser.parse_args() 10 | cluster = SlurmCluster(log_path='/home/travis', hyperparam_optimizer=parsed) 11 | 12 | assert cluster.slurm_time_to_seconds('15:00') == 900 13 | assert cluster.slurm_time_to_seconds('1-12:20:12') == 130812 14 | assert cluster.slurm_time_to_seconds('1:20:12') == 4812 15 | assert cluster.slurm_time_to_seconds('00:20:12') == 1212 16 | assert cluster.slurm_time_to_seconds('00:00:12') == 12 17 | assert cluster.slurm_time_to_seconds('12') == 12 18 | 19 | 20 | if __name__ == '__main__': 21 | pytest.main([__file__]) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017-2018 William Falcon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | from setuptools import find_packages, setup 4 | 5 | version = '0.7.5' 6 | PATH_ROOT = os.path.dirname(__file__) 7 | 8 | 9 | def load_requirements(path_dir=PATH_ROOT, comment_char='#'): 10 | with open(os.path.join(path_dir, 'requirements.txt'), 'r') as file: 11 | lines = [ln.strip() for ln in file.readlines()] 12 | reqs = [] 13 | for ln in lines: 14 | # filer all comments 15 | if comment_char in ln: 16 | ln = ln[:ln.index(comment_char)] 17 | if ln: # if requirement is not empty 18 | reqs.append(ln) 19 | return reqs 20 | 21 | 22 | setup( 23 | name='test_tube', 24 | packages=find_packages(), 25 | version=version, 26 | description='Experiment logger and visualizer', 27 | author='William Falcon', 28 | install_requires=load_requirements(PATH_ROOT), 29 | author_email='will@hacstudios.com', 30 | url='https://github.com/williamFalcon/test_tube', 31 | download_url='https://github.com/williamFalcon/test_tube/archive/{}.tar.gz'.format(version), 32 | keywords=[ 33 | 'testing', 34 | 'machine learning', 35 | 'deep learning', 36 | 'prototyping', 37 | 'experimenting', 38 | 'modeling', 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/strategies_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from test_tube.hyper_opt_utils import strategies 4 | 5 | GRID_SEARCH = 'grid_search' 6 | RANDOM_SEARCH = 'random_search' 7 | 8 | FLAT_PARAMS = [ 9 | [ 10 | {'idx': 0, 'val': 0.0001, 'name': 'learning_rate'}, 11 | {'idx': 1, 'val': 0.001, 'name': 'learning_rate'}, 12 | {'idx': 2, 'val': 0.01, 'name': 'learning_rate'}, 13 | {'idx': 3, 'val': 0.1, 'name': 'learning_rate'} 14 | ], 15 | [ 16 | {'idx': 4, 'val': 0.99, 'name': 'decay'}, 17 | {'idx': 5, 'val': 0.999, 'name': 'decay'}, 18 | ] 19 | ] 20 | def test_unknown_strategy(): 21 | with pytest.raises(ValueError): 22 | strategies.generate_trials( 23 | 'unknown_strategy', FLAT_PARAMS, nb_trials=None) 24 | 25 | def test_grid_search_no_limit(): 26 | trials = strategies.generate_trials( 27 | GRID_SEARCH, FLAT_PARAMS, nb_trials=None) 28 | assert len(trials) == len(FLAT_PARAMS[0]) * len(FLAT_PARAMS[1]) 29 | 30 | def test_grid_search_limit(): 31 | trials = strategies.generate_trials( 32 | GRID_SEARCH, FLAT_PARAMS, nb_trials=5) 33 | assert len(trials) == 5 34 | 35 | 36 | def test_random_search(): 37 | trials = strategies.generate_trials( 38 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=5) 39 | assert len(trials) == 5 40 | 41 | def test_random_search_unbounded_error(): 42 | with pytest.raises(TypeError): 43 | trials = strategies.generate_trials( 44 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=None) 45 | 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | data/ 6 | test_tube_data/ 7 | *.experiment 8 | test.py 9 | example.json 10 | .pytest_cache/ 11 | talk/ 12 | .DS_Store 13 | 14 | # C extensions 15 | *.so 16 | 17 | src 18 | 19 | # Distribution / packaging 20 | .Python 21 | env/ 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | 
.installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *,cover 56 | .hypothesis/ 57 | 58 | .idea 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # IPython Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # dotenv 91 | .env 92 | 93 | # virtualenv 94 | venv/ 95 | ENV/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | -------------------------------------------------------------------------------- /examples/tensorflow_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from test_tube import Experiment, HyperOptArgumentParser 4 | 5 | """ 6 | This script demonstrates how to do a hyperparameter search over 2 parameters in tensorflow 7 | on 4 simultaneous GPUs. Each trial will also save its own experiment logs. 8 | 9 | A single trial gets allocated on a single GPU until all trials have completed. 10 | This means for 10 trials and 4 GPUs, we'll run 4 in parallel twice and the last 2 trials in parallel. 
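Note: the graph code below uses the TensorFlow 1.x API (tf.placeholder / tf.Session); running it on TensorFlow 2.x would require the tf.compat.v1 compatibility layer.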
11 | """ 12 | 13 | 14 | # main training function (very simple) 15 | def train(hparams): 16 | # init exp and track all the parameters from the HyperOptArgumentParser 17 | exp = Experiment( 18 | name=hparams.test_tube_exp_name, 19 | save_dir=hparams.log_path, 20 | autosave=False, 21 | ) 22 | exp.argparse(hparams) 23 | 24 | # define tensorflow graph 25 | x = tf.placeholder(dtype=tf.int32, name='x') 26 | y = tf.placeholder(dtype=tf.int32, name='y') 27 | out = x * y 28 | 29 | sess = tf.Session() 30 | 31 | # Run the tf op 32 | for train_step in range(0, 100): 33 | output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val}) 34 | exp.log({'fake_err': output}) 35 | 36 | # save exp when we're done 37 | exp.save() 38 | 39 | 40 | # set up our argparser and make the y_val tunable 41 | parser = HyperOptArgumentParser(strategy='random_search') 42 | parser.add_argument('--test_tube_exp_name', default='my_test') 43 | parser.add_argument('--log_path', default='/Users/waf/Desktop/test') 44 | parser.opt_list('--y_val', default=12, options=[1, 2, 3, 4], tunable=True) 45 | parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45], tunable=True) 46 | hyperparams = parser.parse_args() 47 | 48 | 49 | # optimize on 4 gpus at the same time 50 | # each gpu will get 1 experiment with a set of hyperparams 51 | hyperparams.optimize_parallel_gpu(train, gpu_ids=['1', '0', '3', '2'], nb_trials=4, nb_workers=4) 52 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Test Tube: Easily log and tune Deep Learning experiments 2 | 3 | Test Tube allows you to easily log metadata and track your machine 4 | learning experiments. 5 | 6 | Use Test Tube if you need to: 7 | 8 | - Track many [Experiments](experiment_tracking/experiment.md) across 9 | models. 10 | - Visualize and compare different 11 | experiments without uploading anywhere. 12 | - [Optimize your 13 | hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/) 14 | using grid search or random search. 15 | - Automatically track ALL parameters for a particular training run. 16 | 17 | Test Tube is compatible with: Python 2 and 3 18 | 19 | ## Getting started 20 | 21 | ------------------------------------------------------------------------ 22 | 23 | ### Create an [Experiment](experiment_tracking/experiment.md) 24 | 25 | ``` {.python} 26 | from test_tube import Experiment 27 | 28 | exp = Experiment(name='dense_model', 29 | debug=False, 30 | save_dir='/Desktop/test_tube') 31 | 32 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2}) 33 | 34 | for step in training_steps: 35 | tng_err = model.eval(tng_x, tng_y) 36 | 37 | exp.log('tng_err': tng_err) 38 | 39 | # training complete! 
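# metrics are written to <save_dir>/<name>/version_<n>/metrics.csv (see the Visualize section below)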
40 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com 41 | ``` 42 | 43 | ------------------------------------------------------------------------ 44 | 45 | ### Optimize your [hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/) 46 | 47 | ``` {.python} 48 | from test_tube import HyperOptArgumentParser 49 | 50 | # subclass of argparse 51 | parser = HyperOptArgumentParser(strategy='random_search') 52 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 53 | 54 | # let's enable optimizing over the number of layers in the network 55 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 56 | 57 | # and tune the number of units in each layer 58 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 59 | 60 | # compile (because it's argparse underneath) 61 | hparams = parser.parse_args() 62 | 63 | # run 20 trials of random search over the hyperparams 64 | for hparam_trial in hparams.trials(20): 65 | train_network(hparam_trial) 66 | ``` 67 | 68 | ------------------------------------------------------------------------ 69 | 70 | ### Visualize 71 | 72 | ``` {.python} 73 | import pandas as pd 74 | import matplotlib 75 | 76 | # each experiment is saved to a metrics.csv file which can be imported anywhere 77 | # images save to exp/version/images 78 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv') 79 | df.tng_err.plot() 80 | ``` 81 | -------------------------------------------------------------------------------- /examples/pytorch_hpc_example.py: -------------------------------------------------------------------------------- 1 | """Example launcher for a hyperparameter search on SLURM. 2 | 3 | This example shows how to use gpus on SLURM with PyTorch. 4 | """ 5 | import torch 6 | 7 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster 8 | 9 | 10 | def train(hparams, *args): 11 | """Train your awesome model. 12 | 13 | :param hparams: The arguments to run the model with. 14 | """ 15 | # Initialize experiments and track all the hyperparameters 16 | exp = Experiment( 17 | name=hparams.test_tube_exp_name, 18 | # Location to save the metrics. 19 | save_dir=hparams.log_path, 20 | autosave=False, 21 | ) 22 | exp.argparse(hparams) 23 | 24 | # Pretend to train. 25 | x = torch.rand((1, hparams.x_val)) 26 | for train_step in range(0, 100): 27 | y = torch.rand((hparams.x_val, 1)) 28 | out = x.mm(y) 29 | exp.log({'fake_err': out.item()}) 30 | 31 | # Save exp when . 32 | exp.save() 33 | 34 | 35 | if __name__ == '__main__': 36 | # Set up our argparser and make the y_val tunable. 37 | parser = HyperOptArgumentParser(strategy='random_search') 38 | parser.add_argument('--test_tube_exp_name', default='my_test') 39 | parser.add_argument('--log_path', default='/some/path/to/log') 40 | parser.opt_list('--y_val', 41 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True) 42 | parser.opt_list('--x_val', 43 | default=12, options=[20, 12, 30, 45], tunable=True) 44 | hyperparams = parser.parse_args() 45 | 46 | # Enable cluster training. 47 | cluster = SlurmCluster( 48 | hyperparam_optimizer=hyperparams, 49 | log_path=hyperparams.log_path, 50 | python_cmd='python3', 51 | test_tube_exp_name=hyperparams.test_tube_exp_name 52 | ) 53 | 54 | # Email results if your hpc supports it. 55 | cluster.notify_job_status( 56 | email='some@email.com', on_done=True, on_fail=True) 57 | 58 | # SLURM Module to load. 
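# (Module names are cluster-specific -- run `module avail` on your cluster's login node and adjust the list below to match.)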
59 | cluster.load_modules([ 60 | 'python-3', 61 | 'anaconda3' 62 | ]) 63 | 64 | # Add commands to the non-SLURM portion. 65 | cluster.add_command('source activate myCondaEnv') 66 | 67 | # Add custom SLURM commands which show up as: 68 | # #comment 69 | # #SBATCH --cmd=value 70 | # ############ 71 | # cluster.add_slurm_cmd( 72 | # cmd='cpus-per-task', value='1', comment='CPUS per task.') 73 | 74 | # Set job compute details (this will apply PER set of hyperparameters.) 75 | cluster.per_experiment_nb_gpus = 4 76 | cluster.per_experiment_nb_nodes = 2 77 | cluster.gpu_type = '1080ti' 78 | 79 | # Each hyperparameter combination will use 8 gpus. 80 | cluster.optimize_parallel_cluster_gpu( 81 | # Function to execute: 82 | train, 83 | # Number of hyperparameter combinations to search: 84 | nb_trials=24, 85 | # This is what will display in the slurm queue: 86 | job_name='first_tt_job') 87 | -------------------------------------------------------------------------------- /examples/hpc_cpu_example.py: -------------------------------------------------------------------------------- 1 | """Example launcher for a hyperparameter search on SLURM.""" 2 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster 3 | 4 | 5 | def train(hparams, *args): 6 | """Train your awesome model. 7 | 8 | :param hparams: The arguments to run the model with. 9 | """ 10 | # Initialize experiments and track all the hyperparameters 11 | exp = Experiment( 12 | name=hparams.test_tube_exp_name, 13 | # Location to save the metrics. 14 | save_dir=hparams.log_path, 15 | # The experiment version is optional, but using the one 16 | # from SLURM means the exp will not collide with other 17 | # versions if SLURM runs multiple at once. 18 | version=hparams.hpc_exp_number, 19 | autosave=False, 20 | ) 21 | exp.argparse(hparams) 22 | 23 | # Pretend to train. 24 | x = hparams.x_val 25 | for train_step in range(0, 100): 26 | y = hparams.y_val 27 | out = x * y 28 | exp.log({'fake_err': out.item()}) # Log metrics. 29 | 30 | # Save exp when done. 31 | exp.save() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Set up our argparser and make the y_val tunable. 36 | parser = HyperOptArgumentParser(strategy='random_search') 37 | parser.add_argument('--test_tube_exp_name', default='my_test') 38 | parser.add_argument('--log_path', default='/some/path/to/log') 39 | parser.opt_list('--y_val', 40 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True) 41 | parser.opt_list('--x_val', 42 | default=12, options=[20, 12, 30, 45], tunable=True) 43 | hyperparams = parser.parse_args() 44 | 45 | # Enable cluster training. 46 | cluster = SlurmCluster( 47 | hyperparam_optimizer=hyperparams, 48 | log_path=hyperparams.log_path, 49 | python_cmd='python3', 50 | test_tube_exp_name=hyperparams.test_tube_exp_name 51 | ) 52 | 53 | # Email results if your hpc supports it. 54 | cluster.notify_job_status( 55 | email='some@email.com', on_done=True, on_fail=True) 56 | 57 | # SLURM Module to load. 58 | cluster.load_modules([ 59 | 'python-3', 60 | 'anaconda3' 61 | ]) 62 | 63 | # Add commands to the non-SLURM portion. 64 | cluster.add_command('source activate myCondaEnv') 65 | 66 | # Add custom SLURM commands which show up as: 67 | # #comment 68 | # #SBATCH --cmd=value 69 | # ############ 70 | # cluster.add_slurm_cmd( 71 | # cmd='cpus-per-task', value='1', comment='CPUS per task.') 72 | 73 | # Set job compute details (this will apply PER set of hyperparameters.) 
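# With the settings below, each trial is allocated 10 nodes x 20 CPUs = 200 CPUs.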
74 | cluster.per_experiment_nb_cpus = 20 75 | cluster.per_experiment_nb_nodes = 10 76 | 77 | # Each hyperparameter combination will use 200 cpus. 78 | cluster.optimize_parallel_cluster_cpu( 79 | # Function to execute: 80 | train, 81 | # Number of hyperparameter combinations to search: 82 | nb_trials=24, 83 | job_name='first_tt_job', 84 | # This is what will display in the slurm queue: 85 | job_display_name='short_name') 86 | -------------------------------------------------------------------------------- /site/search/main.js: -------------------------------------------------------------------------------- 1 | function getSearchTermFromLocation() { 2 | var sPageURL = window.location.search.substring(1); 3 | var sURLVariables = sPageURL.split('&'); 4 | for (var i = 0; i < sURLVariables.length; i++) { 5 | var sParameterName = sURLVariables[i].split('='); 6 | if (sParameterName[0] == 'q') { 7 | return decodeURIComponent(sParameterName[1].replace(/\+/g, '%20')); 8 | } 9 | } 10 | } 11 | 12 | function joinUrl (base, path) { 13 | if (path.substring(0, 1) === "/") { 14 | // path starts with `/`. Thus it is absolute. 15 | return path; 16 | } 17 | if (base.substring(base.length-1) === "/") { 18 | // base ends with `/` 19 | return base + path; 20 | } 21 | return base + "/" + path; 22 | } 23 | 24 | function formatResult (location, title, summary) { 25 | return '

<article><h3><a href="' + joinUrl(base_url, location) + '">'+ title + '</a></h3><p>' + summary +'</p></article>

'; 26 | } 27 | 28 | function displayResults (results) { 29 | var search_results = document.getElementById("mkdocs-search-results"); 30 | while (search_results.firstChild) { 31 | search_results.removeChild(search_results.firstChild); 32 | } 33 | if (results.length > 0){ 34 | for (var i=0; i < results.length; i++){ 35 | var result = results[i]; 36 | var html = formatResult(result.location, result.title, result.summary); 37 | search_results.insertAdjacentHTML('beforeend', html); 38 | } 39 | } else { 40 | search_results.insertAdjacentHTML('beforeend', "

<p>No results found</p>

"); 41 | } 42 | } 43 | 44 | function doSearch () { 45 | var query = document.getElementById('mkdocs-search-query').value; 46 | if (query.length > 2) { 47 | if (!window.Worker) { 48 | displayResults(search(query)); 49 | } else { 50 | searchWorker.postMessage({query: query}); 51 | } 52 | } else { 53 | // Clear results for short queries 54 | displayResults([]); 55 | } 56 | } 57 | 58 | function initSearch () { 59 | var search_input = document.getElementById('mkdocs-search-query'); 60 | if (search_input) { 61 | search_input.addEventListener("keyup", doSearch); 62 | } 63 | var term = getSearchTermFromLocation(); 64 | if (term) { 65 | search_input.value = term; 66 | doSearch(); 67 | } 68 | } 69 | 70 | function onWorkerMessage (e) { 71 | if (e.data.allowSearch) { 72 | initSearch(); 73 | } else if (e.data.results) { 74 | var results = e.data.results; 75 | displayResults(results); 76 | } 77 | } 78 | 79 | if (!window.Worker) { 80 | console.log('Web Worker API not supported'); 81 | // load index in main thread 82 | $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { 83 | console.log('Loaded worker'); 84 | init(); 85 | window.postMessage = function (msg) { 86 | onWorkerMessage({data: msg}); 87 | }; 88 | }).fail(function (jqxhr, settings, exception) { 89 | console.error('Could not load worker.js'); 90 | }); 91 | } else { 92 | // Wrap search in a web worker 93 | var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); 94 | searchWorker.postMessage({init: true}); 95 | searchWorker.onmessage = onWorkerMessage; 96 | } 97 | -------------------------------------------------------------------------------- /test_tube/hyper_opt_utils/strategies.py: -------------------------------------------------------------------------------- 1 | """Hyperparameter search strategies.""" 2 | import itertools 3 | import json 4 | import random 5 | 6 | 7 | def generate_trials(strategy, flat_params, nb_trials=None): 8 | r"""Generates the parameter combinations to search. 9 | 10 | Two search strategies are implemented: 11 | 1. `grid_search`: creates a search space that consists of the 12 | product of all flat_params. If `nb_trials` is specified 13 | the first `nb_trials` combinations are searched. 14 | 2. `random_search`: Creates random combinations of the 15 | hyperparameters. Can be used for a more efficient search. 16 | See (Bergstra and Bengio, 2012) for more details. 17 | 18 | :param strategy: The hyperparameter search to strategy. Can be 19 | one of: {`grid_search`, `random`}. 20 | :param flat_params: The hyperparameter arguments to iterate over. 21 | :param nb_trials: The number of hyperparameter combinations to try. 22 | Generates the parameter combinations for each requested trial 23 | :param strategy: 24 | :param flat_params: 25 | :param nb_trials: The number of trials to un. 26 | :return: 27 | """ 28 | if strategy == 'grid_search': 29 | trials = generate_grid_search_trials(flat_params, nb_trials) 30 | return trials 31 | elif strategy == 'random_search': 32 | trials = generate_random_search_trials(flat_params, nb_trials) 33 | return trials 34 | else: 35 | raise ValueError( 36 | ('Unknown strategy "{}". Must be one of ' 37 | '{{grid_search, random_search}}').format(strategy)) 38 | 39 | 40 | def generate_grid_search_trials(flat_params, nb_trials): 41 | """ 42 | Standard grid search. Takes the product of `flat_params` 43 | to generate the search space. 44 | 45 | :param params: The hyperparameters options to search. 
46 | :param nb_trials: Returns the first `nb_trials` from the 47 | combinations space. If this is None, all combinations 48 | are returned. 49 | :return: A dict containing the hyperparameters. 50 | """ 51 | trials = list(itertools.product(*flat_params)) 52 | if nb_trials: 53 | trials = trials[0:nb_trials] 54 | return trials 55 | 56 | 57 | def generate_random_search_trials(params, nb_trials): 58 | """ 59 | Generates random combination of hyperparameters to try. 60 | See (Bergstra and Bengio, 2012) for more details. 61 | 62 | :param params: The hyperparameters options to search. 63 | :param nb_trials: The number of trials to run. 64 | :return: A dict containing the hyperparameters. 65 | """ 66 | if nb_trials is None: 67 | raise TypeError( 68 | '`random_search` strategy requires nb_trails to be an int.') 69 | results = [] 70 | 71 | # ensures we have unique results 72 | seen_trials = set() 73 | 74 | # shuffle each param list 75 | potential_trials = 1 76 | for param in params: 77 | random.shuffle(param) 78 | potential_trials *= len(param) 79 | 80 | # we can't sample more trials than are possible 81 | max_iters = min(potential_trials, nb_trials) 82 | 83 | # then for the nb of trials requested, create a new param tuple 84 | # by picking a random integer at each param level 85 | while len(results) < max_iters: 86 | trial = [] 87 | for param in params: 88 | sampled_param = random.sample(param, 1)[0] 89 | trial.append(sampled_param) 90 | 91 | # verify this is a unique trial so we 92 | # don't duplicate work 93 | trial_str = json.dumps(trial) 94 | if trial_str not in seen_trials: 95 | seen_trials.add(trial_str) 96 | results.append(trial) 97 | 98 | return results 99 | -------------------------------------------------------------------------------- /site/js/theme.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | // Shift nav in mobile when clicking the menu. 3 | $(document).on('click', "[data-toggle='wy-nav-top']", function() { 4 | $("[data-toggle='wy-nav-shift']").toggleClass("shift"); 5 | $("[data-toggle='rst-versions']").toggleClass("shift"); 6 | }); 7 | 8 | // Close menu when you click a link. 9 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() { 10 | $("[data-toggle='wy-nav-shift']").removeClass("shift"); 11 | $("[data-toggle='rst-versions']").toggleClass("shift"); 12 | }); 13 | 14 | // Keyboard navigation 15 | document.addEventListener("keydown", function(e) { 16 | var key = e.which || e.keyCode || window.event && window.event.keyCode; 17 | var page; 18 | switch (key) { 19 | case 78: // n 20 | page = $('[role="navigation"] a:contains(Next):first').prop('href'); 21 | break; 22 | case 80: // p 23 | page = $('[role="navigation"] a:contains(Previous):first').prop('href'); 24 | break; 25 | case 13: // enter 26 | if (e.target === document.getElementById('mkdocs-search-query')) { 27 | e.preventDefault(); 28 | } 29 | break; 30 | default: break; 31 | } 32 | if ($(e.target).is(':input')) { 33 | return true; 34 | } else if (page) { 35 | window.location.href = page; 36 | } 37 | }); 38 | 39 | $(document).on('click', "[data-toggle='rst-current-version']", function() { 40 | $("[data-toggle='rst-versions']").toggleClass("shift-up"); 41 | }); 42 | 43 | // Make tables responsive 44 | $("table.docutils:not(.field-list)").wrap("
"); 45 | 46 | $('table').addClass('docutils'); 47 | }); 48 | 49 | window.SphinxRtdTheme = (function (jquery) { 50 | var stickyNav = (function () { 51 | var navBar, 52 | win, 53 | stickyNavCssClass = 'stickynav', 54 | applyStickNav = function () { 55 | if (navBar.height() <= win.height()) { 56 | navBar.addClass(stickyNavCssClass); 57 | } else { 58 | navBar.removeClass(stickyNavCssClass); 59 | } 60 | }, 61 | enable = function () { 62 | applyStickNav(); 63 | win.on('resize', applyStickNav); 64 | }, 65 | init = function () { 66 | navBar = jquery('nav.wy-nav-side:first'); 67 | win = jquery(window); 68 | }; 69 | jquery(init); 70 | return { 71 | enable : enable 72 | }; 73 | }()); 74 | return { 75 | StickyNav : stickyNav 76 | }; 77 | }($)); 78 | 79 | // The code below is a copy of @seanmadsen code posted Jan 10, 2017 on issue 803. 80 | // https://github.com/mkdocs/mkdocs/issues/803 81 | // This just incorporates the auto scroll into the theme itself without 82 | // the need for additional custom.js file. 83 | // 84 | $(function() { 85 | $.fn.isFullyWithinViewport = function(){ 86 | var viewport = {}; 87 | viewport.top = $(window).scrollTop(); 88 | viewport.bottom = viewport.top + $(window).height(); 89 | var bounds = {}; 90 | bounds.top = this.offset().top; 91 | bounds.bottom = bounds.top + this.outerHeight(); 92 | return ( ! ( 93 | (bounds.top <= viewport.top) || 94 | (bounds.bottom >= viewport.bottom) 95 | ) ); 96 | }; 97 | if( $('li.toctree-l1.current').length && !$('li.toctree-l1.current').isFullyWithinViewport() ) { 98 | $('.wy-nav-side') 99 | .scrollTop( 100 | $('li.toctree-l1.current').offset().top - 101 | $('.wy-nav-side').offset().top - 102 | 60 103 | ); 104 | } 105 | }); 106 | -------------------------------------------------------------------------------- /site/search/worker.js: -------------------------------------------------------------------------------- 1 | var base_path = 'function' === typeof importScripts ? '.' 
: '/search/'; 2 | var allowSearch = false; 3 | var index; 4 | var documents = {}; 5 | var lang = ['en']; 6 | var data; 7 | 8 | function getScript(script, callback) { 9 | console.log('Loading script: ' + script); 10 | $.getScript(base_path + script).done(function () { 11 | callback(); 12 | }).fail(function (jqxhr, settings, exception) { 13 | console.log('Error: ' + exception); 14 | }); 15 | } 16 | 17 | function getScriptsInOrder(scripts, callback) { 18 | if (scripts.length === 0) { 19 | callback(); 20 | return; 21 | } 22 | getScript(scripts[0], function() { 23 | getScriptsInOrder(scripts.slice(1), callback); 24 | }); 25 | } 26 | 27 | function loadScripts(urls, callback) { 28 | if( 'function' === typeof importScripts ) { 29 | importScripts.apply(null, urls); 30 | callback(); 31 | } else { 32 | getScriptsInOrder(urls, callback); 33 | } 34 | } 35 | 36 | function onJSONLoaded () { 37 | data = JSON.parse(this.responseText); 38 | var scriptsToLoad = ['lunr.js']; 39 | if (data.config && data.config.lang && data.config.lang.length) { 40 | lang = data.config.lang; 41 | } 42 | if (lang.length > 1 || lang[0] !== "en") { 43 | scriptsToLoad.push('lunr.stemmer.support.js'); 44 | if (lang.length > 1) { 45 | scriptsToLoad.push('lunr.multi.js'); 46 | } 47 | for (var i=0; i < lang.length; i++) { 48 | if (lang[i] != 'en') { 49 | scriptsToLoad.push(['lunr', lang[i], 'js'].join('.')); 50 | } 51 | } 52 | } 53 | loadScripts(scriptsToLoad, onScriptsLoaded); 54 | } 55 | 56 | function onScriptsLoaded () { 57 | console.log('All search scripts loaded, building Lunr index...'); 58 | if (data.config && data.config.separator && data.config.separator.length) { 59 | lunr.tokenizer.separator = new RegExp(data.config.separator); 60 | } 61 | if (data.index) { 62 | index = lunr.Index.load(data.index); 63 | data.docs.forEach(function (doc) { 64 | documents[doc.location] = doc; 65 | }); 66 | console.log('Lunr pre-built index loaded, search ready'); 67 | } else { 68 | index = lunr(function () { 69 | if (lang.length === 1 && lang[0] !== "en" && lunr[lang[0]]) { 70 | this.use(lunr[lang[0]]); 71 | } else if (lang.length > 1) { 72 | this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility 73 | } 74 | this.field('title'); 75 | this.field('text'); 76 | this.ref('location'); 77 | 78 | for (var i=0; i < data.docs.length; i++) { 79 | var doc = data.docs[i]; 80 | this.add(doc); 81 | documents[doc.location] = doc; 82 | } 83 | }); 84 | console.log('Lunr index built, search ready'); 85 | } 86 | allowSearch = true; 87 | postMessage({allowSearch: allowSearch}); 88 | } 89 | 90 | function init () { 91 | var oReq = new XMLHttpRequest(); 92 | oReq.addEventListener("load", onJSONLoaded); 93 | var index_path = base_path + '/search_index.json'; 94 | if( 'function' === typeof importScripts ){ 95 | index_path = 'search_index.json'; 96 | } 97 | oReq.open("GET", index_path); 98 | oReq.send(); 99 | } 100 | 101 | function search (query) { 102 | if (!allowSearch) { 103 | console.error('Assets for search still loading'); 104 | return; 105 | } 106 | 107 | var resultDocuments = []; 108 | var results = index.search(query); 109 | for (var i=0; i < results.length; i++){ 110 | var result = results[i]; 111 | doc = documents[result.ref]; 112 | doc.summary = doc.text.substring(0, 200); 113 | resultDocuments.push(doc); 114 | } 115 | return resultDocuments; 116 | } 117 | 118 | if( 'function' === typeof importScripts ) 
{ 119 | onmessage = function (e) { 120 | if (e.data.init) { 121 | init(); 122 | } else if (e.data.query) { 123 | postMessage({ results: search(e.data.query) }); 124 | } else { 125 | console.error("Worker - Unrecognized message: " + e); 126 | } 127 | }; 128 | } 129 | -------------------------------------------------------------------------------- /docs/experiment_tracking/experiment.md: -------------------------------------------------------------------------------- 1 | # Experiment class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/log.py)] 4 | 5 | An Experiment holds metadata and the results of the training run, you 6 | can instantiate an `Experiment` via: 7 | 8 | ``` {.python} 9 | from test_tube import Experiment 10 | 11 | exp = Experiment(name='dense_model', 12 | debug=False, 13 | save_dir='/Desktop/test_tube') 14 | 15 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2}) 16 | 17 | for step in training_steps: 18 | tng_err = model.eval(tng_x, tng_y) 19 | 20 | exp.log({'tng_err': tng_err}) 21 | 22 | # training complete! 23 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com 24 | ``` 25 | 26 | ------------------------------------------------------------------------ 27 | 28 | ## init options 29 | 30 | ### version 31 | 32 | The same Experiment can have multiple versions. Test tube generates 33 | these automatically each time you run your model. To set your own 34 | version use: 35 | 36 | ``` {.python} 37 | exp = Experiment(name='dense_model',version=1) 38 | ``` 39 | 40 | ### debug 41 | 42 | If you're debugging and don't want to create a log file turn debug to 43 | True 44 | 45 | ``` {.python} 46 | exp = Experiment(name='dense_model',debug=True) 47 | ``` 48 | 49 | ### autosave 50 | 51 | If you only want to save at the end of training, turn autosave off: 52 | 53 | ``` {.python} 54 | exp = Experiment(name='dense_model', autosave=False) 55 | 56 | # run long training... 57 | 58 | # first time any logs are saved 59 | exp.save() 60 | ``` 61 | 62 | ### `create_git_tag` 63 | 64 | Ever wanted a flashback to your code when you ran an experiment? 
65 | Snapshot your code for this experiment using git tags: 66 | 67 | ``` {.python} 68 | exp = Experiment(name='dense_model', create_git_tag=True) 69 | ``` 70 | 71 | ------------------------------------------------------------------------ 72 | 73 | ## Methods 74 | 75 | ### tag 76 | 77 | ``` {.python} 78 | exp.tag({k: v}) 79 | ``` 80 | 81 | Adds an arbitrary dictionary of tags to the experiment 82 | 83 | **Example** 84 | 85 | ``` {.python} 86 | exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002}) 87 | ``` 88 | 89 | ### log 90 | 91 | ``` {.python} 92 | exp.log({k:v}) 93 | ``` 94 | 95 | Adds a row of data to the experiments 96 | 97 | **Example** 98 | 99 | ``` {.python} 100 | exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12}) 101 | 102 | # you can also add other rows that have separate information 103 | exp.log({'tng_loss': 0.01}) 104 | 105 | # or even a numpy array image 106 | image = np.imread('image.png') 107 | exp.log({'fake_png': image}) 108 | ``` 109 | 110 | **Saving images Example** 111 | 112 | ``` {.python} 113 | # name must have either jpg, png or jpeg in it 114 | img = np.imread('a.jpg') 115 | exp.log('test_jpg': img, 'val_err': 0.2) 116 | 117 | # saves image to ../exp/version/media/test_0.jpg 118 | # csv has file path to that image in that cell 119 | ``` 120 | 121 | To save an image, add `jpg`, `png` or `jpeg` to the key corresponding 122 | with the image array. The image must be formatted the same as skimage's 123 | [imsave](http://scikit-image.org/docs/dev/api/skimage.io.html#skimage.io.imsave) 124 | function 125 | 126 | ### argparse 127 | 128 | ``` {.python} 129 | exp.argparse(hparams) 130 | ``` 131 | 132 | Transfers hyperparam information from Argparser or 133 | HyperOptArgumentParser 134 | 135 | **Example** 136 | 137 | ``` {.python} 138 | from test_tube import HyperOptArgumentParser 139 | 140 | # parse args 141 | parser = HyperOptArgumentParser() 142 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 143 | hparams = parser.parse_args() 144 | 145 | # learning_rate is now a meta tag for your experiment 146 | exp.argparse(hparams) 147 | ``` 148 | 149 | ### save 150 | 151 | ``` {.python} 152 | exp.save() 153 | ``` 154 | 155 | Saves the exp to disk (including images) 156 | 157 | **Example** 158 | 159 | ``` {.python} 160 | exp = Experiment(name='dense_model', autosave=False) 161 | 162 | # run long training... 163 | 164 | # first time any logs are saved 165 | exp.save() 166 | ``` 167 | -------------------------------------------------------------------------------- /site/css/theme_extra.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Sphinx doesn't have support for section dividers like we do in 3 | * MkDocs, this styles the section titles in the nav 4 | * 5 | * https://github.com/mkdocs/mkdocs/issues/175 6 | */ 7 | .wy-menu-vertical span { 8 | line-height: 18px; 9 | padding: 0.4045em 1.618em; 10 | display: block; 11 | position: relative; 12 | font-size: 90%; 13 | color: #838383; 14 | } 15 | 16 | .wy-menu-vertical .subnav a { 17 | padding: 0.4045em 2.427em; 18 | } 19 | 20 | /* 21 | * Long navigations run off the bottom of the screen as the nav 22 | * area doesn't scroll. 23 | * 24 | * https://github.com/mkdocs/mkdocs/pull/202 25 | * 26 | * Builds upon pull 202 https://github.com/mkdocs/mkdocs/pull/202 27 | * to make toc scrollbar end before navigations buttons to not be overlapping. 
28 | */ 29 | .wy-nav-side { 30 | height: calc(100% - 45px); 31 | overflow-y: auto; 32 | min-height: 0; 33 | } 34 | 35 | .rst-versions{ 36 | border-top: 0; 37 | height: 45px; 38 | } 39 | 40 | @media screen and (max-width: 768px) { 41 | .wy-nav-side { 42 | height: 100%; 43 | } 44 | } 45 | 46 | /* 47 | * readthedocs theme hides nav items when the window height is 48 | * too small to contain them. 49 | * 50 | * https://github.com/mkdocs/mkdocs/issues/#348 51 | */ 52 | .wy-menu-vertical ul { 53 | margin-bottom: 2em; 54 | } 55 | 56 | /* 57 | * Wrap inline code samples otherwise they shoot of the side and 58 | * can't be read at all. 59 | * 60 | * https://github.com/mkdocs/mkdocs/issues/313 61 | * https://github.com/mkdocs/mkdocs/issues/233 62 | * https://github.com/mkdocs/mkdocs/issues/834 63 | */ 64 | code { 65 | white-space: pre-wrap; 66 | word-wrap: break-word; 67 | padding: 2px 5px; 68 | } 69 | 70 | /** 71 | * Make code blocks display as blocks and give them the appropriate 72 | * font size and padding. 73 | * 74 | * https://github.com/mkdocs/mkdocs/issues/855 75 | * https://github.com/mkdocs/mkdocs/issues/834 76 | * https://github.com/mkdocs/mkdocs/issues/233 77 | */ 78 | pre code { 79 | white-space: pre; 80 | word-wrap: normal; 81 | display: block; 82 | padding: 12px; 83 | font-size: 12px; 84 | } 85 | 86 | /* 87 | * Fix link colors when the link text is inline code. 88 | * 89 | * https://github.com/mkdocs/mkdocs/issues/718 90 | */ 91 | a code { 92 | color: #2980B9; 93 | } 94 | a:hover code { 95 | color: #3091d1; 96 | } 97 | a:visited code { 98 | color: #9B59B6; 99 | } 100 | 101 | /* 102 | * The CSS classes from highlight.js seem to clash with the 103 | * ReadTheDocs theme causing some code to be incorrectly made 104 | * bold and italic. 105 | * 106 | * https://github.com/mkdocs/mkdocs/issues/411 107 | */ 108 | pre .cs, pre .c { 109 | font-weight: inherit; 110 | font-style: inherit; 111 | } 112 | 113 | /* 114 | * Fix some issues with the theme and non-highlighted code 115 | * samples. Without and highlighting styles attached the 116 | * formatting is broken. 117 | * 118 | * https://github.com/mkdocs/mkdocs/issues/319 119 | */ 120 | .no-highlight { 121 | display: block; 122 | padding: 0.5em; 123 | color: #333; 124 | } 125 | 126 | 127 | /* 128 | * Additions specific to the search functionality provided by MkDocs 129 | */ 130 | 131 | .search-results { 132 | margin-top: 23px; 133 | } 134 | 135 | .search-results article { 136 | border-top: 1px solid #E1E4E5; 137 | padding-top: 24px; 138 | } 139 | 140 | .search-results article:first-child { 141 | border-top: none; 142 | } 143 | 144 | form .search-query { 145 | width: 100%; 146 | border-radius: 50px; 147 | padding: 6px 12px; /* csslint allow: box-model */ 148 | border-color: #D1D4D5; 149 | } 150 | 151 | .wy-menu-vertical li ul { 152 | display: inherit; 153 | } 154 | 155 | .wy-menu-vertical li ul.subnav ul.subnav{ 156 | padding-left: 1em; 157 | } 158 | 159 | .wy-menu-vertical .subnav li.current > a { 160 | padding-left: 2.42em; 161 | } 162 | .wy-menu-vertical .subnav li.current > ul li a { 163 | padding-left: 3.23em; 164 | } 165 | 166 | /* 167 | * Improve inline code blocks within admonitions. 168 | * 169 | * https://github.com/mkdocs/mkdocs/issues/656 170 | */ 171 | .admonition code { 172 | color: #404040; 173 | border: 1px solid #c7c9cb; 174 | border: 1px solid rgba(0, 0, 0, 0.2); 175 | background: #f8fbfd; 176 | background: rgba(255, 255, 255, 0.7); 177 | } 178 | 179 | /* 180 | * Account for wide tables which go off the side. 
181 | * Override borders to avoid wierdness on narrow tables. 182 | * 183 | * https://github.com/mkdocs/mkdocs/issues/834 184 | * https://github.com/mkdocs/mkdocs/pull/1034 185 | */ 186 | .rst-content .section .docutils { 187 | width: 100%; 188 | overflow: auto; 189 | display: block; 190 | border: none; 191 | } 192 | 193 | td, th { 194 | border: 1px solid #e1e4e5 !important; /* csslint allow: important */ 195 | border-collapse: collapse; 196 | } 197 | 198 | -------------------------------------------------------------------------------- /site/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
[mkdocs "readthedocs" theme page chrome: site navigation sidebar, breadcrumbs, a "404 / Page not found" notice, previous/next links, and a GitHub footer link]
-------------------------------------------------------------------------------- /site/search.html: --------------------------------------------------------------------------------
[mkdocs "readthedocs" theme search page titled "Test tube Documentation": site navigation sidebar, breadcrumbs, a "Search Results" section with a "Searching..." placeholder, previous/next links, and a GitHub footer link]
156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /docs/hyperparameter_optimization/HyperOptArgumentParser.md: -------------------------------------------------------------------------------- 1 | # HyperOptArgumentParser class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/argparse_hopt.py)] 4 | 5 | The HyperOptArgumentParser is a subclass of python's 6 | [argparse](https://docs.python.org/3/library/argparse.html), with added 7 | finctionality to change parameters on the fly as determined by a 8 | sampling strategy. 9 | 10 | You can instantiate an `HyperOptArgumentParser` via: 11 | 12 | ``` {.python} 13 | from test_tube import HyperOptArgumentParser 14 | 15 | # subclass of argparse 16 | parser = HyperOptArgumentParser(strategy='random_search') 17 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 18 | 19 | # let's enable optimizing over the number of layers in the network 20 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 21 | 22 | # and tune the number of units in each layer 23 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 24 | 25 | # compile (because it's argparse underneath) 26 | hparams = parser.parse_args() 27 | 28 | # run 20 trials of random search over the hyperparams 29 | for hparam_trial in hparams.trials(20): 30 | train_network(hparam_trial) 31 | ``` 32 | 33 | ------------------------------------------------------------------------ 34 | 35 | ## init options 36 | 37 | ### `strategy` 38 | 39 | Use either [random 40 | search](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) 41 | or [grid 42 | search](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) 43 | for tuning: 44 | 45 | ``` {.python} 46 | parser = HyperOptArgumentParser(strategy='grid_search') 47 | ``` 48 | 49 | ------------------------------------------------------------------------ 50 | 51 | ## Methods 52 | 53 | All the functionality from argparse works but we've added the following 54 | functionality: 55 | 56 | ### `opt_list` 57 | 58 | ``` {.python} 59 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 60 | ``` 61 | 62 | Enables searching over a list of values for this parameter. The tunable 63 | values ONLY replace the argparse values when running a hyperparameter 64 | optimization search. This is on purpose so your code doesn't have to 65 | change when you want to tune it. 66 | 67 | **Example** 68 | 69 | ``` {.python} 70 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 71 | hparams = parser.parse_args() 72 | # hparams.nb_layers = 2 73 | 74 | for trial in hparams.trials(2): 75 | # trial.nb_layers is now a value in [2, 4, 8] 76 | # but hparams.nb_layers is still 2 77 | ``` 78 | 79 | ### `opt_range` 80 | 81 | ``` {.python} 82 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None) 83 | ``` 84 | 85 | Enables searching over a range of values chosen randomly using the 86 | `nb_samples` given. The tunable values *only* replace the argparse 87 | values when running a hyperparameter optimization search. This is on 88 | purpose so your code doesn't have to change when you want to tune it. 
89 | 90 | If `log_base` is set to a positive number, it will randomly search over 91 | a log scale, where the log base is `log_base`. This is better for search 92 | over several orders of magnitude efficiently. 93 | 94 | **Example** 95 | 96 | ``` {.python} 97 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8) 98 | hparams = parser.parse_args() 99 | # hparams.neurons = 50 100 | 101 | for trial in hparams.trials(2): 102 | # trial.nb_layers is now a value in [100, 200, 300, 400, 500, 600 700, 800] 103 | # but hparams.neurons is still 50 104 | ``` 105 | 106 | ### `json_config` 107 | 108 | ``` {.python} 109 | parser.json_config('--config', default='example.json') 110 | ``` 111 | 112 | Replaces default values in the parser with those read from the json file 113 | 114 | **Example** 115 | 116 | *example.json* 117 | 118 | ``` {.json} 119 | { 120 | "learning_rate": 200 121 | } 122 | ``` 123 | 124 | ``` {.python} 125 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 126 | parser.json_config('--config', default='example.json') 127 | hparams = parser.parse_args() 128 | 129 | # hparams.learning_rate = 200 130 | ``` 131 | 132 | ### trials 133 | 134 | ``` {.python} 135 | trial_generator = hparams.trials(2) 136 | ``` 137 | 138 | Computes the trials needed for these experiments and serves them via a 139 | generator 140 | 141 | **Example** 142 | 143 | ``` {.python} 144 | hparams = parser.parse_args() 145 | for trial_hparams in hparams.trials(2): 146 | # trial_hparams now has values sampled from the training routine 147 | ``` 148 | 149 | ### `optimize_parallel_gpu` 150 | 151 | ``` {.python} 152 | hparams = parser.parse_args() 153 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2']) 154 | ``` 155 | 156 | Parallelize the trials across `nb_workers` processes. Auto assign the 157 | correct gpus. Argument passed into the `function_to_optimize` is the 158 | `trial_params` argument and the gpu_ids. 159 | 160 | **Example** 161 | 162 | ``` {.python} 163 | # parallelize tuning on 2 gpus 164 | # this will place each trial in n into a given gpu 165 | def train_main(trial_params, gpu_ids): 166 | # train your model, etc here... 167 | 168 | hparams = parser.parse_args() 169 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2']) 170 | 171 | # at the end of the optimize_parallel function, all 20 trials will be completed 172 | # in this case by running 10 sets of 2 trials in parallel 173 | ``` 174 | 175 | ### `optimize_parallel_cpu` 176 | 177 | ``` {.python} 178 | hparams = parser.parse_args() 179 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2) 180 | ``` 181 | 182 | Parallelize the trials across `nb_workers` cpus. Argument passed into 183 | the `function_to_optimize` is the `trial_params` argument. 184 | 185 | **Example** 186 | 187 | ``` {.python} 188 | # parallelize tuning on 2 cpus 189 | # this will place each trial in n into a given gpu 190 | def train_main(trial_params): 191 | # train your model, etc here... 
192 | 193 | hparams = parser.parse_args() 194 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2) 195 | 196 | # at the end of the optimize_parallel function, all 20 trials will be completed 197 | # in this case by running 10 sets of 2 trials in parallel 198 | ``` 199 | -------------------------------------------------------------------------------- /test_tube/hyperopt.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import random 4 | 5 | 6 | class HyperParamOptimizer(object): 7 | 8 | def __init__(self, method='grid_search', enabled=True, experiment=None): 9 | """ 10 | :param method: 'grid_search', 'random_search' 11 | :param enabled: 12 | """ 13 | self.method = method 14 | self.enabled = enabled 15 | self.experiment = experiment 16 | self.seen_params = {} 17 | self.current_iteration = 0 18 | 19 | # the params to use at each trial 20 | self.trials = None 21 | 22 | # total iterations we're doing 23 | self.nb_iterations = None 24 | 25 | # details about each param 26 | self.params = [] 27 | 28 | # ----------------------------- 29 | # PARAMETER CHOICES 30 | # ----------------------------- 31 | def tune_uniform(self, low, high, samples, default, name): 32 | # how this fx samples for the data 33 | def gen_samples(): 34 | vals = [random.uniform(low, high) for i in range(samples)] 35 | return vals 36 | 37 | return self.__resolve_param(gen_samples, default, name) 38 | 39 | def tune_odds(self, low, high, default, name): 40 | start = low if low %2 != 0 else low + 1 41 | def gen_samples(): 42 | return range(start, high+1, 2) 43 | 44 | return self.__resolve_param(gen_samples, default, name) 45 | 46 | def tune_evens(self, low, high, default, name): 47 | start = low if low %2 == 0 else low + 1 48 | def gen_samples(): 49 | return range(start, high+1, 2) 50 | 51 | return self.__resolve_param(gen_samples, default, name) 52 | 53 | def tune_choice(self, options, default, name): 54 | def gen_samples(): 55 | return options 56 | 57 | return self.__resolve_param(gen_samples, default, name) 58 | 59 | def __resolve_param(self, gen_fx, default, name): 60 | # case when no action was requested 61 | if not self.enabled: 62 | return default 63 | 64 | # create the param when it's new 65 | # return the first value in this case 66 | if name not in self.seen_params: 67 | vals = gen_fx() 68 | param = {'vals': vals, 'name': name} 69 | self.seen_params[name] = {'idx': len(self.params)} 70 | self.params.append(param) 71 | return vals[0] 72 | 73 | # not the first iteration so return the ith element 74 | # in the possible values 75 | iteration_params = self.trials[self.current_iteration] 76 | param_i = self.seen_params[name]['idx'] 77 | param = iteration_params[param_i] 78 | return param['val'] 79 | 80 | # ----------------------------- 81 | # OPTIMIZATION 82 | # ----------------------------- 83 | def optimize(self, fx, nb_iterations=None): 84 | """ 85 | Primary entry point into the optimization 86 | :param fx: 87 | :param nb_iterations: 88 | :return: 89 | """ 90 | self.nb_iterations = nb_iterations 91 | 92 | # run first iteration 93 | result = fx(self) 94 | 95 | # log if requested 96 | if self.experiment is not None: 97 | result['hypo_iter_nb'] = self.current_iteration 98 | self.experiment.log(result) 99 | 100 | self.current_iteration += 1 101 | 102 | # generate the rest of the training seq 103 | # we couldn't do this before because we don't know 104 | # how many params the user needed 105 | self.__generate_trials() 106 | 107 | # run 
trials for the rest of the iterations 108 | # we either know the iterations or they're 109 | # calculated from the strategy used 110 | for i in range(1, len(self.trials)): 111 | result = fx(self) 112 | result['hypo_iter_nb'] = self.current_iteration 113 | 114 | # log if requested 115 | if self.experiment is not None: 116 | self.experiment.log(result) 117 | 118 | self.current_iteration += 1 119 | 120 | # ----------------------------- 121 | # INTERFACE WITH LOGGER 122 | # ----------------------------- 123 | def get_current_trial_meta(self): 124 | meta_results = [] 125 | 126 | # when we have trials, means we've already done 1 run 127 | # we can just get the params that are about to be run 128 | # otherwise we need to infer params from the current param list 129 | # this assumes the user feeds the opt into the experiment after 130 | # they're done setting up the params 131 | is_first_trial = self.trials is not None and len(self.trials) > 0 132 | if is_first_trial: 133 | trial_params = self.trials[self.current_iteration] 134 | for trial_param in trial_params: 135 | root_param = self.params[trial_param['idx']] 136 | meta_results.append({'hypo_' + root_param['name']: trial_param['val']}) 137 | 138 | # if we haven't done a pass through the data yet, 139 | # we need to infer from the params in the list 140 | else: 141 | for param in self.params: 142 | meta_results.append({'hypo_' + param['name']: param['vals'][0]}) 143 | 144 | # add shared meta 145 | meta_results.append({'hypo_iter_nb': self.current_iteration}) 146 | return meta_results 147 | 148 | # ----------------------------- 149 | # TRIALS HELPER 150 | # ----------------------------- 151 | def __generate_trials(self): 152 | """ 153 | Generates the parameter combinations for each requested trial 154 | :return: 155 | """ 156 | flat_params = self.__flatten_params(self.params) 157 | 158 | # permute for grid search 159 | if self.method == 'grid_search': 160 | self.trials = list(itertools.product(*flat_params)) 161 | 162 | if self.nb_iterations is not None: 163 | self.trials = self.trials[0: self.nb_iterations] 164 | 165 | if self.method == 'random_search': 166 | self.trials = self.__generate_random_search_trials(flat_params) 167 | 168 | def __flatten_params(self, params): 169 | """ 170 | Turns a list of parameters with values into a flat tuple list of lists 171 | so we can permute 172 | :param params: 173 | :return: 174 | """ 175 | flat_params = [] 176 | for i, param in enumerate(params): 177 | param_groups = [] 178 | for val in param['vals']: 179 | param_groups.append({'idx': i, 'val': val}) 180 | flat_params.append(param_groups) 181 | return flat_params 182 | 183 | def __generate_random_search_trials(self, params): 184 | results = [] 185 | 186 | # ensures we have unique results 187 | seen_trials = set() 188 | 189 | # shuffle each param list 190 | potential_trials = 1 191 | for p in params: 192 | random.shuffle(p) 193 | potential_trials *= len(p) 194 | 195 | # we can't sample more trials than are possible 196 | max_iters = min(potential_trials, self.nb_iterations) 197 | 198 | # then for the nb of trials requested, create a new param tuple 199 | # by picking a random integer at each param level 200 | while len(results) < max_iters: 201 | trial = [] 202 | for param in params: 203 | p = random.sample(param, 1)[0] 204 | trial.append(p) 205 | 206 | # verify this is a unique trial so we 207 | # don't duplicate work 208 | trial_str = json.dumps(trial) 209 | if trial_str not in seen_trials: 210 | seen_trials.add(trial_str) 211 | results.append(trial) 212 
| 213 | return results 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | react-router 4 | 5 |

6 |

7 | Test Tube 8 |

9 |

10 | Log, organize and parallelize hyperparameter search for Deep Learning experiments 11 |

12 |

13 | PyPI version 14 | 15 | 16 | 17 |

18 | 19 | ## Docs 20 | 21 | **[View the docs here](https://williamfalcon.github.io/test-tube/)** 22 | 23 | --- 24 | 25 | Test tube is a python library to track and parallelize hyperparameter 26 | search for Deep Learning and ML experiments. It's framework agnostic and 27 | built on top of the python argparse API for ease of use. 28 | 29 | ``` {.bash} 30 | pip install test_tube 31 | ``` 32 | 33 | --- 34 | 35 | ### Main test-tube uses 36 | 37 | - [Parallelize hyperparameter 38 | optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) 39 | (across multiple gpus or cpus). 40 | - [Parallelize hyperparameter 41 | optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) 42 | across HPC cluster using SLURM. 43 | - Log experiment hyperparameters and experiment data. 44 | [Experiments](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) 45 | across models. 46 | - Visualize with [tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard) 47 | 48 | Compatible with any Python ML library like Tensorflow, Keras, Pytorch, Caffe, Caffe2, Chainer, MXNet, Theano, Scikit-learn 49 | 50 | --- 51 | ### Examples 52 | The Experiment object is a subclass of PyTorch's SummaryWriter. 53 | 54 | **Log and visualize with Tensorboard** 55 | 56 | ```{.python} 57 | from test_tube import Experiment 58 | import torch 59 | import numpy as np 60 | exp = Experiment('/some/path') 61 | exp.tag({'learning_rate': 0.02, 'layers': 4}) 62 | 63 | # exp is a subclass of SummaryWriter, so tensorboard methods work directly 64 | features = torch.Tensor(100, 784) 65 | exp.add_embedding(features, metadata=list(range(100)), label_img=torch.rand(100, 1, 28, 28)) 66 | 67 | # simulate training 68 | for n_iter in range(2000): 69 |     exp.log({'testtt': n_iter * np.sin(n_iter)}) 70 | 71 | # save and close 72 | exp.save() 73 | exp.close() 74 | ``` 75 | 76 | ```{.bash} 77 | pip install tensorflow 78 | 79 | tensorboard --logdir /some/path 80 | ``` 81 | 82 | **Run grid search on SLURM GPU cluster** 83 | 84 | ``` {.python} 85 | from test_tube.hpc import SlurmCluster 86 | 87 | # hyperparameters is a test-tube hyper params object 88 | hyperparams = args.parse() 89 | 90 | # init cluster 91 | cluster = SlurmCluster( 92 |     hyperparam_optimizer=hyperparams, 93 |     log_path='/path/to/log/results/to', 94 |     python_cmd='python3' 95 | ) 96 | 97 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...) 98 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 99 | 100 | # set the job options. In this instance, we'll run 20 different models 101 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs) 102 | cluster.per_experiment_nb_gpus = 1 103 | cluster.per_experiment_nb_nodes = 1 104 | 105 | # run the models on the cluster 106 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch') 107 | 108 | # we just ran 20 different hyperparameters on 20 GPUs in the HPC cluster!!
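# --- sketch, not in the original README: the `train` function handed to the cluster above
# --- must accept the trial's hyperparameters plus the cluster object, per the SlurmCluster
# --- docs later in this repo. It is shown here only to document the signature; in a real
# --- script it would be defined before the optimize_parallel_cluster_gpu(...) call.
def train(hparams, cluster, return_dict):
    # build and fit your model with this trial's hparams here
    pass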
109 | ``` 110 | 111 | **Optimize hyperparameters across GPUs** 112 | 113 | ``` {.python} 114 | from test_tube import HyperOptArgumentParser 115 | 116 | # subclass of argparse 117 | parser = HyperOptArgumentParser(strategy='random_search') 118 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 119 | 120 | # let's enable optimizing over the number of layers in the network 121 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 122 | 123 | # and tune the number of units in each layer 124 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 125 | 126 | # compile (because it's argparse underneath) 127 | hparams = parser.parse_args() 128 | 129 | # optimize across 4 gpus 130 | # use 2 gpus together and the other two separately 131 | hparams.optimize_parallel_gpu(MyModel.fit, gpu_ids=['1', '2,3', '0'], max_nb_trials=192) 132 | ``` 133 | 134 | Or... across CPUs 135 | 136 | ``` {.python} 137 | hparams.optimize_parallel_cpu(MyModel.fit, nb_trials=192, nb_workers=12) 138 | ``` 139 | 140 | You can also optimize on a *log* scale to allow better search over 141 | magnitudes of hyperparameter values, with a chosen base (disabled by 142 | default). Keep in mind that the range you search over must be strictly 143 | positive. 144 | 145 | ``` {.python} 146 | from test_tube import HyperOptArgumentParser 147 | 148 | # subclass of argparse 149 | parser = HyperOptArgumentParser(strategy='random_search') 150 | 151 | # Randomly searches over the (log-transformed) range [100,800). 152 | 153 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10, log_base=10) 154 | 155 | 156 | # compile (because it's argparse underneath) 157 | hparams = parser.parse_args() 158 | 159 | # run 20 trials of random search over the hyperparams 160 | for hparam_trial in hparams.trials(20): 161 |     train_network(hparam_trial) 162 | ``` 163 | 164 | ### Convert your argparse params into searchable params by changing 1 line 165 | 166 | ``` {.python} 167 | import argparse 168 | from test_tube import HyperOptArgumentParser 169 | 170 | # these lines are equivalent 171 | parser = argparse.ArgumentParser(description='Process some integers.') 172 | parser = HyperOptArgumentParser(description='Process some integers.', strategy='grid_search') 173 | 174 | # do normal argparse stuff 175 | ... 176 | ``` 177 | 178 | ### Log images inline with metrics 179 | 180 | ``` {.python} 181 | # name must have either jpg, png or jpeg in it 182 | img = imageio.imread('a.jpg')  # any reader that returns a numpy array works 183 | exp.log({'test_jpg': img, 'val_err': 0.2}) 184 | 185 | # saves image to ../exp/version/media/test_0.jpg 186 | # csv has file path to that image in that cell 187 | ``` 188 | 189 | ## Demos 190 | 191 | - [Hyperparameter optimization for PyTorch across 20 cluster GPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py) 192 | - [Hyperparameter optimization across 20 cluster CPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py) 193 | - [Experiments and hyperparameter optimization for tensorflow across 4 GPUs simultaneously](https://github.com/williamFalcon/test-tube/blob/master/examples/tensorflow_example.py) 194 | 195 | ## How to contribute 196 | 197 | Feel free to fix bugs and make improvements! 1.
Check out the [current 198 | bugs here](https://github.com/williamFalcon/test-tube/issues) or 199 | [feature 200 | requests](https://github.com/williamFalcon/test-tube/projects/1). 2. To 201 | work on a bug or feature, head over to our [project 202 | page](https://github.com/williamFalcon/test-tube/projects/1) and assign 203 | yourself the bug. 3. We'll add contributor names periodically as people 204 | improve the library! 205 | 206 | ## Bibtex 207 | 208 | To cite the framework use: 209 | 210 | @misc{Falcon2017, 211 | author = {Falcon, W.A.}, 212 | title = {Test Tube}, 213 | year = {2017}, 214 | publisher = {GitHub}, 215 | journal = {GitHub repository}, 216 | howpublished = {\url{https://github.com/williamfalcon/test-tube}} 217 | } 218 | 219 | ## License 220 | In addition to the terms outlined in the license, this software is U.S. Patent Pending. 221 | -------------------------------------------------------------------------------- /site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Test Tube: Easily log and tune Deep Learning experiments - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 105 | 106 |
107 | 108 | 109 | 113 | 114 | 115 |
116 |
117 |
118 |
    119 |
  • Docs »
  • 120 | 121 | 122 | 123 |
  • Test Tube: Easily log and tune Deep Learning experiments
  • 124 |
  • 125 | 126 | Edit on GitHub 128 | 129 |
  • 130 |
131 |
132 |
133 |
134 |
135 | 136 |

Test Tube: Easily log and tune Deep Learning experiments

137 |

Test Tube allows you to easily log metadata and track your machine 138 | learning experiments.

139 |

Use Test Tube if you need to:

140 |
    141 |
  • Track many Experiments across 142 | models.
  • 143 |
  • Visualize and compare different 144 | experiments without uploading anywhere.
  • 145 |
  • Optimize your 146 | hyperparameters 147 | using grid search or random search.
  • 148 |
  • Automatically track ALL parameters for a particular training run.
  • 149 |
150 |

Test Tube is compatible with: Python 2 and 3

151 |

Getting started

152 |
153 |

Create an Experiment

154 |
from test_tube import Experiment
155 | 
156 | exp = Experiment(name='dense_model',
157 |                  debug=False,
158 |                  save_dir='/Desktop/test_tube')
159 | 
160 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
161 | 
162 | for step in training_steps:
163 |     tng_err = model.eval(tng_x, tng_y)
164 | 
165 |     exp.log({'tng_err': tng_err})
166 | 
167 | # training complete!
168 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
169 | 
170 | 171 |
172 |
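A small sketch (not part of the original docs) connecting this Experiment snippet to the hyperparameter parser shown in the next section: arguments parsed with a HyperOptArgumentParser can be attached to an Experiment with exp.argparse, so every run records its settings automatically. The names and save path below are illustrative.

``` {.python}
from test_tube import Experiment, HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--learning_rate', default=0.002, type=float)
hparams = parser.parse_args()

exp = Experiment(name='dense_model', save_dir='/Desktop/test_tube')

# every parsed argument becomes a meta tag of this experiment version
exp.argparse(hparams)
exp.save()
```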

Optimize your hyperparameters

173 |
from test_tube import HyperOptArgumentParser
174 | 
175 | # subclass of argparse
176 | parser = HyperOptArgumentParser(strategy='random_search')
177 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
178 | 
179 | # let's enable optimizing over the number of layers in the network
180 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
181 | 
182 | # and tune the number of units in each layer
183 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
184 | 
185 | # compile (because it's argparse underneath)
186 | hparams = parser.parse_args()
187 | 
188 | # run 20 trials of random search over the hyperparams
189 | for hparam_trial in hparams.trials(20):
190 |     train_network(hparam_trial)
191 | 
192 | 193 |
194 |

Visualize

195 |
import pandas as pd
196 | import matplotlib
197 | 
198 | # each experiment is saved to a metrics.csv file which can be imported anywhere
199 | # images save to exp/version/media
200 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv')
201 | df.tng_err.plot()
202 | 
203 | 204 |
205 |
206 | 225 | 226 |
227 |
228 | 229 |
230 | 231 |
232 | 233 |
234 | 235 | 236 | GitHub 237 | 238 | 239 | 240 | Next » 241 | 242 | 243 |
244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 255 | -------------------------------------------------------------------------------- /site/js/modernizr-2.8.3.min.js: -------------------------------------------------------------------------------- 1 | window.Modernizr=function(e,t,n){function r(e){b.cssText=e}function o(e,t){return r(S.join(e+";")+(t||""))}function a(e,t){return typeof e===t}function i(e,t){return!!~(""+e).indexOf(t)}function c(e,t){for(var r in e){var o=e[r];if(!i(o,"-")&&b[o]!==n)return"pfx"==t?o:!0}return!1}function s(e,t,r){for(var o in e){var i=t[e[o]];if(i!==n)return r===!1?e[o]:a(i,"function")?i.bind(r||t):i}return!1}function u(e,t,n){var r=e.charAt(0).toUpperCase()+e.slice(1),o=(e+" "+k.join(r+" ")+r).split(" ");return a(t,"string")||a(t,"undefined")?c(o,t):(o=(e+" "+T.join(r+" ")+r).split(" "),s(o,t,n))}function l(){p.input=function(n){for(var r=0,o=n.length;o>r;r++)j[n[r]]=!!(n[r]in E);return j.list&&(j.list=!(!t.createElement("datalist")||!e.HTMLDataListElement)),j}("autocomplete autofocus list placeholder max min multiple pattern required step".split(" ")),p.inputtypes=function(e){for(var r,o,a,i=0,c=e.length;c>i;i++)E.setAttribute("type",o=e[i]),r="text"!==E.type,r&&(E.value=x,E.style.cssText="position:absolute;visibility:hidden;",/^range$/.test(o)&&E.style.WebkitAppearance!==n?(g.appendChild(E),a=t.defaultView,r=a.getComputedStyle&&"textfield"!==a.getComputedStyle(E,null).WebkitAppearance&&0!==E.offsetHeight,g.removeChild(E)):/^(search|tel)$/.test(o)||(r=/^(url|email)$/.test(o)?E.checkValidity&&E.checkValidity()===!1:E.value!=x)),P[e[i]]=!!r;return P}("search tel url email datetime date month week time datetime-local number range color".split(" "))}var d,f,m="2.8.3",p={},h=!0,g=t.documentElement,v="modernizr",y=t.createElement(v),b=y.style,E=t.createElement("input"),x=":)",w={}.toString,S=" -webkit- -moz- -o- -ms- ".split(" "),C="Webkit Moz O ms",k=C.split(" "),T=C.toLowerCase().split(" "),N={svg:"http://www.w3.org/2000/svg"},M={},P={},j={},$=[],D=$.slice,F=function(e,n,r,o){var a,i,c,s,u=t.createElement("div"),l=t.body,d=l||t.createElement("body");if(parseInt(r,10))for(;r--;)c=t.createElement("div"),c.id=o?o[r]:v+(r+1),u.appendChild(c);return a=["­",'"].join(""),u.id=v,(l?u:d).innerHTML+=a,d.appendChild(u),l||(d.style.background="",d.style.overflow="hidden",s=g.style.overflow,g.style.overflow="hidden",g.appendChild(d)),i=n(u,e),l?u.parentNode.removeChild(u):(d.parentNode.removeChild(d),g.style.overflow=s),!!i},z=function(t){var n=e.matchMedia||e.msMatchMedia;if(n)return n(t)&&n(t).matches||!1;var r;return F("@media "+t+" { #"+v+" { position: absolute; } }",function(t){r="absolute"==(e.getComputedStyle?getComputedStyle(t,null):t.currentStyle).position}),r},A=function(){function e(e,o){o=o||t.createElement(r[e]||"div"),e="on"+e;var i=e in o;return i||(o.setAttribute||(o=t.createElement("div")),o.setAttribute&&o.removeAttribute&&(o.setAttribute(e,""),i=a(o[e],"function"),a(o[e],"undefined")||(o[e]=n),o.removeAttribute(e))),o=null,i}var r={select:"input",change:"input",submit:"form",reset:"form",error:"img",load:"img",abort:"img"};return e}(),L={}.hasOwnProperty;f=a(L,"undefined")||a(L.call,"undefined")?function(e,t){return t in e&&a(e.constructor.prototype[t],"undefined")}:function(e,t){return L.call(e,t)},Function.prototype.bind||(Function.prototype.bind=function(e){var t=this;if("function"!=typeof t)throw new TypeError;var n=D.call(arguments,1),r=function(){if(this instanceof r){var o=function(){};o.prototype=t.prototype;var a=new 
o,i=t.apply(a,n.concat(D.call(arguments)));return Object(i)===i?i:a}return t.apply(e,n.concat(D.call(arguments)))};return r}),M.flexbox=function(){return u("flexWrap")},M.flexboxlegacy=function(){return u("boxDirection")},M.canvas=function(){var e=t.createElement("canvas");return!(!e.getContext||!e.getContext("2d"))},M.canvastext=function(){return!(!p.canvas||!a(t.createElement("canvas").getContext("2d").fillText,"function"))},M.webgl=function(){return!!e.WebGLRenderingContext},M.touch=function(){var n;return"ontouchstart"in e||e.DocumentTouch&&t instanceof DocumentTouch?n=!0:F(["@media (",S.join("touch-enabled),("),v,")","{#modernizr{top:9px;position:absolute}}"].join(""),function(e){n=9===e.offsetTop}),n},M.geolocation=function(){return"geolocation"in navigator},M.postmessage=function(){return!!e.postMessage},M.websqldatabase=function(){return!!e.openDatabase},M.indexedDB=function(){return!!u("indexedDB",e)},M.hashchange=function(){return A("hashchange",e)&&(t.documentMode===n||t.documentMode>7)},M.history=function(){return!(!e.history||!history.pushState)},M.draganddrop=function(){var e=t.createElement("div");return"draggable"in e||"ondragstart"in e&&"ondrop"in e},M.websockets=function(){return"WebSocket"in e||"MozWebSocket"in e},M.rgba=function(){return r("background-color:rgba(150,255,150,.5)"),i(b.backgroundColor,"rgba")},M.hsla=function(){return r("background-color:hsla(120,40%,100%,.5)"),i(b.backgroundColor,"rgba")||i(b.backgroundColor,"hsla")},M.multiplebgs=function(){return r("background:url(https://),url(https://),red url(https://)"),/(url\s*\(.*?){3}/.test(b.background)},M.backgroundsize=function(){return u("backgroundSize")},M.borderimage=function(){return u("borderImage")},M.borderradius=function(){return u("borderRadius")},M.boxshadow=function(){return u("boxShadow")},M.textshadow=function(){return""===t.createElement("div").style.textShadow},M.opacity=function(){return o("opacity:.55"),/^0.55$/.test(b.opacity)},M.cssanimations=function(){return u("animationName")},M.csscolumns=function(){return u("columnCount")},M.cssgradients=function(){var e="background-image:",t="gradient(linear,left top,right bottom,from(#9f9),to(white));",n="linear-gradient(left top,#9f9, white);";return r((e+"-webkit- ".split(" ").join(t+e)+S.join(n+e)).slice(0,-e.length)),i(b.backgroundImage,"gradient")},M.cssreflections=function(){return u("boxReflect")},M.csstransforms=function(){return!!u("transform")},M.csstransforms3d=function(){var e=!!u("perspective");return e&&"webkitPerspective"in g.style&&F("@media (transform-3d),(-webkit-transform-3d){#modernizr{left:9px;position:absolute;height:3px;}}",function(t){e=9===t.offsetLeft&&3===t.offsetHeight}),e},M.csstransitions=function(){return u("transition")},M.fontface=function(){var e;return F('@font-face {font-family:"font";src:url("https://")}',function(n,r){var o=t.getElementById("smodernizr"),a=o.sheet||o.styleSheet,i=a?a.cssRules&&a.cssRules[0]?a.cssRules[0].cssText:a.cssText||"":"";e=/src/i.test(i)&&0===i.indexOf(r.split(" ")[0])}),e},M.generatedcontent=function(){var e;return F(["#",v,"{font:0/0 a}#",v,':after{content:"',x,'";visibility:hidden;font:3px/1 a}'].join(""),function(t){e=t.offsetHeight>=3}),e},M.video=function(){var e=t.createElement("video"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('video/ogg; codecs="theora"').replace(/^no$/,""),n.h264=e.canPlayType('video/mp4; codecs="avc1.42E01E"').replace(/^no$/,""),n.webm=e.canPlayType('video/webm; codecs="vp8, vorbis"').replace(/^no$/,""))}catch(r){}return 
n},M.audio=function(){var e=t.createElement("audio"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('audio/ogg; codecs="vorbis"').replace(/^no$/,""),n.mp3=e.canPlayType("audio/mpeg;").replace(/^no$/,""),n.wav=e.canPlayType('audio/wav; codecs="1"').replace(/^no$/,""),n.m4a=(e.canPlayType("audio/x-m4a;")||e.canPlayType("audio/aac;")).replace(/^no$/,""))}catch(r){}return n},M.localstorage=function(){try{return localStorage.setItem(v,v),localStorage.removeItem(v),!0}catch(e){return!1}},M.sessionstorage=function(){try{return sessionStorage.setItem(v,v),sessionStorage.removeItem(v),!0}catch(e){return!1}},M.webworkers=function(){return!!e.Worker},M.applicationcache=function(){return!!e.applicationCache},M.svg=function(){return!!t.createElementNS&&!!t.createElementNS(N.svg,"svg").createSVGRect},M.inlinesvg=function(){var e=t.createElement("div");return e.innerHTML="",(e.firstChild&&e.firstChild.namespaceURI)==N.svg},M.smil=function(){return!!t.createElementNS&&/SVGAnimate/.test(w.call(t.createElementNS(N.svg,"animate")))},M.svgclippaths=function(){return!!t.createElementNS&&/SVGClipPath/.test(w.call(t.createElementNS(N.svg,"clipPath")))};for(var H in M)f(M,H)&&(d=H.toLowerCase(),p[d]=M[H](),$.push((p[d]?"":"no-")+d));return p.input||l(),p.addTest=function(e,t){if("object"==typeof e)for(var r in e)f(e,r)&&p.addTest(r,e[r]);else{if(e=e.toLowerCase(),p[e]!==n)return p;t="function"==typeof t?t():t,"undefined"!=typeof h&&h&&(g.className+=" "+(t?"":"no-")+e),p[e]=t}return p},r(""),y=E=null,function(e,t){function n(e,t){var n=e.createElement("p"),r=e.getElementsByTagName("head")[0]||e.documentElement;return n.innerHTML="x",r.insertBefore(n.lastChild,r.firstChild)}function r(){var e=y.elements;return"string"==typeof e?e.split(" "):e}function o(e){var t=v[e[h]];return t||(t={},g++,e[h]=g,v[g]=t),t}function a(e,n,r){if(n||(n=t),l)return n.createElement(e);r||(r=o(n));var a;return a=r.cache[e]?r.cache[e].cloneNode():p.test(e)?(r.cache[e]=r.createElem(e)).cloneNode():r.createElem(e),!a.canHaveChildren||m.test(e)||a.tagUrn?a:r.frag.appendChild(a)}function i(e,n){if(e||(e=t),l)return e.createDocumentFragment();n=n||o(e);for(var a=n.frag.cloneNode(),i=0,c=r(),s=c.length;s>i;i++)a.createElement(c[i]);return a}function c(e,t){t.cache||(t.cache={},t.createElem=e.createElement,t.createFrag=e.createDocumentFragment,t.frag=t.createFrag()),e.createElement=function(n){return y.shivMethods?a(n,e,t):t.createElem(n)},e.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+r().join().replace(/[\w\-]+/g,function(e){return t.createElem(e),t.frag.createElement(e),'c("'+e+'")'})+");return n}")(y,t.frag)}function s(e){e||(e=t);var r=o(e);return!y.shivCSS||u||r.hasCSS||(r.hasCSS=!!n(e,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||c(e,r),e}var u,l,d="3.7.0",f=e.html5||{},m=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,h="_html5shiv",g=0,v={};!function(){try{var e=t.createElement("a");e.innerHTML="",u="hidden"in e,l=1==e.childNodes.length||function(){t.createElement("a");var e=t.createDocumentFragment();return"undefined"==typeof e.cloneNode||"undefined"==typeof e.createDocumentFragment||"undefined"==typeof e.createElement}()}catch(n){u=!0,l=!0}}();var y={elements:f.elements||"abbr article aside audio bdi canvas data 
datalist details dialog figcaption figure footer header hgroup main mark meter nav output progress section summary template time video",version:d,shivCSS:f.shivCSS!==!1,supportsUnknownElements:l,shivMethods:f.shivMethods!==!1,type:"default",shivDocument:s,createElement:a,createDocumentFragment:i};e.html5=y,s(t)}(this,t),p._version=m,p._prefixes=S,p._domPrefixes=T,p._cssomPrefixes=k,p.mq=z,p.hasEvent=A,p.testProp=function(e){return c([e])},p.testAllProps=u,p.testStyles=F,p.prefixed=function(e,t,n){return t?u(e,t,n):u(e,"pfx")},g.className=g.className.replace(/(^|\s)no-js(\s|$)/,"$1$2")+(h?" js "+$.join(" "):""),p}(this,this.document); -------------------------------------------------------------------------------- /site/experiment_tracking/experiment/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Experiment class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 107 | 108 |
109 | 110 | 111 | 115 | 116 | 117 |
118 |
119 |
120 |
    121 |
  • Docs »
  • 122 | 123 | 124 | 125 |
  • Experiment tracking »
  • 126 | 127 | 128 | 129 |
  • Experiment class API
  • 130 |
  • 131 | 132 | Edit on GitHub 134 | 135 |
  • 136 |
137 |
138 |
139 |
140 |
141 | 142 |

Experiment class API

143 |

[Github Code]

144 |

An Experiment holds metadata and the results of the training run. You 145 | can instantiate an Experiment via:

146 |
from test_tube import Experiment
147 | 
148 | exp = Experiment(name='dense_model',
149 |                  debug=False,
150 |                  save_dir='/Desktop/test_tube')
151 | 
152 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
153 | 
154 | for step in training_steps:
155 |     tng_err = model.eval(tng_x, tng_y)
156 | 
157 |     exp.log({'tng_err': tng_err})
158 | 
159 | # training complete!
160 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
161 | 
162 | 163 |
164 |

init options

165 |

version

166 |

The same Experiment can have multiple versions. Test tube generates 167 | these automatically each time you run your model. To set your own 168 | version use:

169 |
exp = Experiment(name='dense_model',version=1)
170 | 
171 | 172 |

debug

173 |

If you're debugging and don't want to create a log file, set debug to 174 | True

175 |
exp = Experiment(name='dense_model',debug=True)
176 | 
177 | 178 |

autosave

179 |

If you only want to save at the end of training, turn autosave off:

180 |
exp = Experiment(name='dense_model', autosave=False)
181 | 
182 | # run long training...
183 | 
184 | # first time any logs are saved
185 | exp.save()
186 | 
187 | 188 |

create_git_tag

189 |

Ever wanted a flashback to your code when you ran an experiment? 190 | Snapshot your code for this experiment using git tags:

191 |
exp = Experiment(name='dense_model', create_git_tag=True)
192 | 
193 | 194 |
195 |

Methods

196 |

tag

197 |
exp.tag({k: v})
198 | 
199 | 200 |

Adds an arbitrary dictionary of tags to the experiment

201 |

Example

202 |
exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002})
203 | 
204 | 205 |

log

206 |
exp.log({k:v})
207 | 
208 | 209 |

Adds a row of data to the experiment

210 |

Example

211 |
exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12})
212 | 
213 | # you can also add other rows that have separate information
214 | exp.log({'tng_loss': 0.01})
215 | 
216 | # or even a numpy array image
217 | image = imageio.imread('image.png')
218 | exp.log({'fake_png': image})
219 | 
220 | 221 |

Saving images Example

222 |
# name must have either jpg, png or jpeg in it
223 | img = imageio.imread('a.jpg')  # any reader that returns a numpy array works
224 | exp.log({'test_jpg': img, 'val_err': 0.2})
225 | 
226 | # saves image to ../exp/version/media/test_0.jpg
227 | # csv has file path to that image in that cell
228 | 
229 | 230 |

To save an image, add jpg, png or jpeg to the key corresponding 231 | to the image array. The image must be formatted the same as skimage's 232 | imsave 233 | function
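For completeness, a self-contained sketch of the snippet above (the array shape, key name and save path are illustrative, not from the original docs):

``` {.python}
import numpy as np
from test_tube import Experiment

exp = Experiment(name='dense_model', save_dir='/Desktop/test_tube')

# any HxWx3 array works; the 'png' in the key tells test tube to write it out as an image file
fake_img = np.random.randint(0, 255, size=(32, 32, 3), dtype=np.uint8)
exp.log({'sample_png': fake_img, 'val_err': 0.2})
exp.save()
```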

234 |

argparse

235 |
exp.argparse(hparams)
236 | 
237 | 238 |

Transfers hyperparameter information from an ArgumentParser or 239 | HyperOptArgumentParser

240 |

Example

241 |
from test_tube import HyperOptArgumentParser
242 | 
243 | # parse args
244 | parser = HyperOptArgumentParser()
245 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
246 | hparams = parser.parse_args()
247 | 
248 | # learning_rate is now a meta tag for your experiment
249 | exp.argparse(hparams)
250 | 
251 | 252 |

save

253 |
exp.save()
254 | 
255 | 256 |

Saves the exp to disk (including images)

257 |

Example

258 |
exp = Experiment(name='dense_model', autosave=False)
259 | 
260 | # run long training...
261 | 
262 | # first time any logs are saved
263 | exp.save()
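As a rough guide (inferred from the metrics.csv and media paths used elsewhere in these docs, so treat the exact layout as an assumption), save() writes everything under save_dir/name/version_N, and the logged rows can be read back with pandas:

``` {.python}
import pandas as pd

# version_0 is the first run; later runs of the same experiment name get version_1, version_2, ...
df = pd.read_csv('/Desktop/test_tube/dense_model/version_0/metrics.csv')
print(df.head())

# images logged with a jpg/png/jpeg key are written under .../version_0/media/
```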
264 | 
265 | 266 |
267 |
268 | 289 | 290 |
291 |
292 | 293 |
294 | 295 |
296 | 297 |
298 | 299 | 300 | GitHub 301 | 302 | 303 | « Previous 304 | 305 | 306 | Next » 307 | 308 | 309 |
310 | 311 | 312 | 313 | 314 | 315 | 316 | -------------------------------------------------------------------------------- /docs/hpc/SlurmCluster.md: -------------------------------------------------------------------------------- 1 | # SlurmCluster class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/hpc.py)] 4 | 5 | The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via [Slurm workload manager](https://slurm.schedmd.com/). 6 | 7 | At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If the job hits the walltime but has not completed, the SlurmManager will checkpoint the model and submit a new job to continue training using the saved weights. 8 | 9 | - Here's a [full GPU PyTorch example](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py). 10 | - Here's a [full CPU example](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py). 11 | 12 | You can instantiate a `SlurmCluster` via: 13 | 14 | ``` {.python} 15 | from test_tube.hpc import SlurmCluster 16 | 17 | # hyperparameters is a test-tube hyper params object 18 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/ 19 | hyperparams = args.parse() 20 | 21 | # init cluster 22 | cluster = SlurmCluster( 23 | hyperparam_optimizer=hyperparams, 24 | log_path='/path/to/log/results/to', 25 | python_cmd='python3' 26 | ) 27 | 28 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...) 29 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 30 | 31 | # set the job options. In this instance, we'll run 20 different models 32 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs) 33 | cluster.per_experiment_nb_gpus = 1 34 | cluster.per_experiment_nb_nodes = 1 35 | 36 | # we'll request 10GB of memory per node 37 | cluster.memory_mb_per_node = 10000 38 | 39 | # set a walltime of 10 minues 40 | cluster.job_time = '10:00' 41 | 42 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job. 43 | # you must provide your own loading and saving function which the cluster object will call 44 | cluster.minutes_to_checkpoint_before_walltime = 1 45 | 46 | # run the models on the cluster 47 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch') 48 | ``` 49 | 50 | ------------------------------------------------------------------------ 51 | 52 | ## init options 53 | 54 | ### `hyperparam_optimizer` 55 | 56 | A [HyperOptArgumentParser](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) object 57 | which contains all permutations of model hyperparameters to run. 58 | 59 | ### `log_path` 60 | 61 | Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) information. 62 | 63 | ### `python_cmd` 64 | 65 | This is the command that starts the python program. Normally it is: 66 | 67 | ``` {.python} 68 | # python 2 69 | python main.py 70 | 71 | # python 3 72 | python3 main.py 73 | ``` 74 | 75 | ### `enable_log_err` 76 | 77 | If true, saves slurm error logs to the path at *log_path*. 
If anything goes wrong in your job, you'll find the error here. 78 | 79 | ### `enable_log_out` 80 | 81 | If true, saves slurm output logs to the path at *log_path*. This file contains all outputs that would show up on the console normally. 82 | 83 | ### `test_tube_exp_name` 84 | 85 | When this is given, it structures the files in a nice format to fit with the folder structure of the [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) object's output. 86 | 87 | ## Properties 88 | 89 | `job_time` 90 | String. Walltime requested. Examples: 91 | ```{.python} 92 | # 1 hour and 10 minutes 93 | cluster.job_time = '1:10:00' 94 | 95 | # 1 day and 1 hour and 10 minutes 96 | cluster.job_time = '1-1:10:00' 97 | 98 | # 1 day and 1 hour and 10 minutes 99 | cluster.job_time = '25:10:00' 100 | 101 | # 10 minutes 102 | cluster.job_time = '10:00' 103 | 104 | # 10 seconds 105 | cluster.job_time = '10' 106 | ``` 107 | 108 | `minutes_to_checkpoint_before_walltime` 109 | Int. Minutes before walltime when a continuation job will be auto-submitted. 110 | ```{.python} 111 | cluster.job_time = '10:00' 112 | cluster.minutes_to_checkpoint_before_walltime = 2 113 | 114 | # New job will be submited to continue training after 8 minutes of the job running. 115 | ``` 116 | 117 | `per_experiment_nb_gpus` 118 | Int. Number of GPUs each job will get. 119 | ```{.python} 120 | # EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time). 121 | cluster.per_experiment_nb_gpus = 2 122 | ``` 123 | 124 | `per_experiment_nb_cpus` 125 | Int. Number of CPUs each job will get. 126 | ```{.python} 127 | cluster.per_experiment_nb_cpus = 1 128 | ``` 129 | 130 | `per_experiment_nb_nodes` 131 | Int. Number of nodes each job will get. 132 | ```{.python} 133 | cluster.per_experiment_nb_nodes = 1 134 | ``` 135 | 136 | `gpu_type` 137 | String. Gpu type requested. Example: 138 | ```{.python} 139 | cluster.gpu_type = '1080ti' 140 | ``` 141 | 142 | ------------------------------------------------------------------------ 143 | 144 | ## Methods 145 | 146 | ### `set_checkpoint_save_function` 147 | 148 | ``` {.python} 149 | cluster.set_checkpoint_save_function(fx, kwargs) 150 | ``` 151 | 152 | Called if the model isn't finished training *minutes_to_checkpoint_before_walltime* before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = '1:00' the SlurmCluster will call your save function after 14 minutes of training. 153 | 154 | - ```fx``` A python function. 155 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments. 156 | 157 | **Example** 158 | 159 | ``` {.python} 160 | def my_save_function(arg_1, arg_k): 161 | # ... save my model here 162 | 163 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'}) 164 | 165 | ``` 166 | 167 | ### `set_checkpoint_load_function` 168 | 169 | ``` {.python} 170 | cluster.set_checkpoint_load_function(fx, kwargs) 171 | ``` 172 | 173 | Called internally when a job is auto-submitted by the SlurmCluster to give your program a chance to load the model weights or whatever you need to continue training. 174 | This will call your load function immediately whenever you call this method AND training is continuing. 175 | 176 | - ```fx``` A python function. 177 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments. 
178 | 179 | **Example** 180 | 181 | ``` {.python} 182 | def my_load_function(arg_1, arg_k): 183 |     # ... restore my model here 184 | 185 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'}) 186 | 187 | ``` 188 | 189 | ### `add_slurm_cmd` 190 | 191 | ``` {.python} 192 | cluster.add_slurm_cmd(cmd, value, comment) 193 | ``` 194 | 195 | Adds whatever Slurm command you need manually to the generated script. All possible commands are listed [here](https://slurm.schedmd.com/pdfs/summary.pdf). 196 | 197 | - ```cmd``` String with the bash command. 198 | - ```value``` String value for the command. Numericals need to be in single quotes ```'1'``` 199 | - ```comment``` String with the command comment. 200 | 201 | **Example** 202 | 203 | ``` {.python} 204 | cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task') 205 | 206 | # the above command will add an entry like this to the slurm script 207 | 208 | # #nb cpus per task 209 | # #SBATCH --cpus-per-task=1 210 | # ############ 211 | 212 | ``` 213 | 214 | ### `add_command` 215 | 216 | ``` {.python} 217 | cluster.add_command(cmd) 218 | ``` 219 | 220 | Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, whatever else you would think about calling on bash. 221 | 222 | - ```cmd``` String with your bash command. 223 | 224 | **Example** 225 | 226 | 227 | ``` {.python} 228 | # load the anaconda package on the launch node 229 | cluster.add_command('module load anaconda') 230 | 231 | # activate the environment on the launch node 232 | cluster.add_command('source activate myCondaEnv') 233 | ``` 234 | 235 | ### `load_modules` 236 | 237 | ``` {.python} 238 | cluster.load_modules(modules) 239 | ``` 240 | 241 | Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running ```module avail```. 242 | - ```modules``` Array of module names. 243 | 244 | **Example** 245 | 246 | 247 | ``` {.python} 248 | cluster.load_modules([ 249 |     'python-3', 250 |     'anaconda3' 251 | ]) 252 | ``` 253 | 254 | ### `notify_job_status` 255 | 256 | ``` {.python} 257 | cluster.notify_job_status(email, on_done, on_fail) 258 | ``` 259 | 260 | Sets up email notifications about changes in job status (ie: when a job completes or fails). 261 | 262 | - ```email``` String. Email address to get notifications. 263 | - ```on_done``` Boolean. If true, you'll get an email when the job completes. 264 | - ```on_fail``` Boolean. If true, you'll get an email if the job fails. 265 | 266 | **Example** 267 | 268 | 269 | ``` {.python} 270 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 271 | ``` 272 | 273 | ### `optimize_parallel_cluster_gpu` 274 | 275 | ``` {.python} 276 | cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None) 277 | ``` 278 | 279 | Launches the hyperparameter search across the cluster nodes. 280 | - ```train_function``` The entry point to start your training routine. 281 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over. 282 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name.
283 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end). 284 | 285 | **Example** 286 | 287 | 288 | ``` {.python} 289 | def main(hparams, cluster, return_dict): 290 | # do your own generic training code here... 291 | # init model 292 | model = model_build(hparams) 293 | 294 | # set the load and save fxs 295 | cluster.set_checkpoint_save_function(fx, {}) 296 | cluster.set_checkpoint_load_function(fx, {}) 297 | 298 | # train ... 299 | 300 | 301 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj') 302 | ``` 303 | 304 | Now if you get the job information, you'll see this: 305 | ``` {.bash} 306 | (conda_env) [user@node dir]$ squeue -u my_name 307 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 308 | 104040 all mjv0 my_name R 58:22 1 nodeName 309 | 104041 all mjv1 my_name R 58:22 1 nodeName 310 | 104042 all mjv2 my_name R 58:22 1 nodeName 311 | 104043 all mjv3 my_name R 58:22 1 nodeName 312 | ``` 313 | 314 | ### `optimize_parallel_cluster_cpu` 315 | 316 | ``` {.python} 317 | cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None) 318 | ``` 319 | 320 | Launches the hyperparameter search across the cluster nodes using cpus. 321 | - ```train_function``` The entry point to start your training routine. 322 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over. 323 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name. 324 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end). 325 | 326 | **Example** 327 | 328 | 329 | ``` {.python} 330 | def main(hparams, cluster, return_dict): 331 | # do your own generic training code here... 332 | # init model 333 | model = model_build(hparams) 334 | 335 | # set the load and save fxs 336 | cluster.set_checkpoint_save_function(fx, {}) 337 | cluster.set_checkpoint_load_function(fx, {}) 338 | 339 | # train ... 340 | 341 | 342 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj') 343 | ``` 344 | 345 | Now if you get the job information, you'll see this: 346 | ``` {.bash} 347 | (conda_env) [user@node dir]$ squeue -u my_name 348 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 349 | 104040 all mjv0 my_name R 58:22 1 nodeName 350 | 104041 all mjv1 my_name R 58:22 1 nodeName 351 | 104042 all mjv2 my_name R 58:22 1 nodeName 352 | 104043 all mjv3 my_name R 58:22 1 nodeName 353 | ``` 354 | -------------------------------------------------------------------------------- /site/hyperparameter_optimization/HyperOptArgumentParser/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | HyperOptArgumentParser class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 107 | 108 |
109 | 110 | 111 | 115 | 116 | 117 |
118 |
119 |
120 |
    121 |
  • Docs »
  • 122 | 123 | 124 | 125 |
  • Hyperparameter optimization »
  • 126 | 127 | 128 | 129 |
  • HyperOptArgumentParser class API
  • 130 |
  • 131 | 132 | Edit on GitHub 134 | 135 |
  • 136 |
137 |
138 |
139 |
140 |
141 | 142 |

HyperOptArgumentParser class API

143 |

[Github Code]

144 |

The HyperOptArgumentParser is a subclass of Python's 145 | argparse, with added 146 | functionality to change parameters on the fly as determined by a 147 | sampling strategy.

148 |

You can instantiate a HyperOptArgumentParser via:

149 |
from test_tube import HyperOptArgumentParser
150 | 
151 | # subclass of argparse
152 | parser = HyperOptArgumentParser(strategy='random_search')
153 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
154 | 
155 | # let's enable optimizing over the number of layers in the network
156 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
157 | 
158 | # and tune the number of units in each layer
159 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
160 | 
161 | # compile (because it's argparse underneath)
162 | hparams = parser.parse_args()
163 | 
164 | # run 20 trials of random search over the hyperparams
165 | for hparam_trial in hparams.trials(20):
166 |     train_network(hparam_trial)
167 | 
168 | 169 |
170 |

init options

171 |

strategy

172 |

Use either random 173 | search 174 | or grid 175 | search 176 | for tuning:

177 |
parser = HyperOptArgumentParser(strategy='grid_search')
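A rough sketch of the practical difference (the parameter names below are illustrative): with grid_search the trials walk every combination of the tunable options, while random_search samples that many combinations at random.

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='grid_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
parser.opt_list('--optimizer', default='adam', type=str, tunable=True, options=['adam', 'sgd'])
hparams = parser.parse_args()

# with grid_search these 6 trials cover all 3 x 2 combinations of the options above;
# with strategy='random_search' the same call would sample 6 combinations instead
for trial in hparams.trials(6):
    print(trial.nb_layers, trial.optimizer)
```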
178 | 
179 | 180 |
181 |

Methods

182 |

All the functionality from argparse works but we've added the following 183 | functionality:

184 |

opt_list

185 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
186 | 
187 | 188 |

Enables searching over a list of values for this parameter. The tunable 189 | values ONLY replace the argparse values when running a hyperparameter 190 | optimization search. This is on purpose so your code doesn't have to 191 | change when you want to tune it.

192 |

Example

193 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
194 | hparams = parser.parse_args()
195 | # hparams.nb_layers = 2
196 | 
197 | for trial in hparams.trials(2):
198 |     # trial.nb_layers is now a value in [2, 4, 8]
199 |     # but hparams.nb_layers is still 2
200 | 
201 | 202 |

opt_range

203 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None)
204 | 
205 | 206 |

Enables searching over a range of values chosen randomly using the 207 | nb_samples given. The tunable values only replace the argparse 208 | values when running a hyperparameter optimization search. This is on 209 | purpose so your code doesn't have to change when you want to tune it.

210 |

If log_base is set to a positive number, it will randomly search over 211 | a log scale, where the log base is log_base. This lets you search 212 | efficiently over several orders of magnitude.

213 |

Example

214 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8)
215 | hparams = parser.parse_args()
216 | # hparams.neurons = 50
217 | 
218 | for trial in hparams.trials(2):
219 |     # trial.neurons is now a value in [100, 200, 300, 400, 500, 600, 700, 800]
220 |     # but hparams.neurons is still 50
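The log_base option mentioned above can be combined with opt_range for ranges spanning several orders of magnitude; a short sketch mirroring the README's log-scale example (the bounds here are illustrative):

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')

# sample learning rates on a log10 scale between 1e-5 and 1e-1
# (a range searched this way must be strictly positive)
parser.opt_range('--learning_rate', default=0.001, type=float, tunable=True,
                 low=1e-5, high=1e-1, nb_samples=10, log_base=10)

hparams = parser.parse_args()
for trial in hparams.trials(10):
    print(trial.learning_rate)
```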
221 | 
222 | 223 |

json_config

224 |
parser.json_config('--config', default='example.json')
225 | 
226 | 227 |

Replaces default values in the parser with those read from the json file

228 |

Example

229 |

example.json

230 |
{
231 |     "learning_rate": 200
232 | }
233 | 
234 | 235 |
parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
236 | parser.json_config('--config', default='example.json')
237 | hparams = parser.parse_args()
238 | 
239 | # hparams.learning_rate = 200
240 | 
241 | 242 |

trials

243 |
trial_generator = hparams.trials(2)
244 | 
245 | 246 |

Computes the trials needed for these experiments and serves them via a 247 | generator

248 |

Example

249 |
hparams = parser.parse_args()
250 | for trial_hparams in hparams.trials(2):
251 |     # trial_hparams now has values sampled from the training routine
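Judging from the parser source shipped in this repo (test_tube/argparse_hopt.py), there is also a generate_trials(nb_trials) helper that returns the sampled configurations as a plain list rather than a generator; a small sketch:

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
parser.opt_list('--optimizer', default='adam', type=str, tunable=True, options=['adam', 'sgd'])
hparams = parser.parse_args()

# a list of trial namespaces sampled according to the parser's strategy
trials = hparams.generate_trials(4)
print(len(trials), trials[0].nb_layers)
```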
252 | 
253 | 254 |

optimize_parallel_gpu

255 |
hparams = parser.parse_args()
256 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2'])
257 | 
258 | 259 |

Parallelize the trials across nb_workers processes. Auto assign the 260 | correct gpus. Argument passed into the function_to_optimize is the 261 | trial_params argument and the gpu_ids.

262 |

Example

263 |
# parallelize tuning on 2 gpus
264 | # this will place each of the n trials onto one of the given gpus
265 | def train_main(trial_params, gpu_ids):
266 |     # train your model, etc here...
267 | 
268 | hparams = parser.parse_args()
269 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'])
270 | 
271 | # at the end of the optimize_parallel function, all 20 trials will be completed
272 | # in this case by running 10 sets of 2 trials in parallel
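Judging from the implementation in test_tube/argparse_hopt.py (included in this repo), one worker process is started per entry in gpu_ids and each worker exports its entry as CUDA_VISIBLE_DEVICES, so an entry like '0, 2' gives that worker two GPUs. A short sketch (the max_nb_trials value is illustrative):

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])

def train_main(trial_params, gpu_ids):
    # gpu_ids is the entry this worker pulled from the queue, e.g. '1' or '0, 2'
    print('training on GPUs', gpu_ids, 'with', trial_params.nb_layers, 'layers')

hparams = parser.parse_args()

# 2 workers: one with GPU 1, one sharing GPUs 0 and 2
hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'], max_nb_trials=4)
```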
273 | 
274 | 275 |

optimize_parallel_cpu

276 |
hparams = parser.parse_args()
277 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2)
278 | 
279 | 280 |

Parallelize the trials across nb_workers cpus. Argument passed into 281 | the function_to_optimize is the trial_params argument.

282 |

Example

283 |
# parallelize tuning on 2 cpus
284 | # this will place each of the n trials onto one of the available cpus
285 | def train_main(trial_params):
286 |     # train your model, etc here...
287 | 
288 | hparams = parser.parse_args()
289 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2)
290 | 
291 | # at the end of the optimize_parallel function, all 20 trials will be completed
292 | # in this case by running 10 sets of 2 trials in parallel
293 | 
294 | 295 |
296 |
297 | 316 | 317 |
318 |
319 | 320 |
321 | 322 |
323 | 324 |
325 | 326 | 327 | GitHub 328 | 329 | 330 | « Previous 331 | 332 | 333 | 334 |
335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /test_tube/argparse_hopt.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import random 6 | import re 7 | import traceback 8 | from argparse import ArgumentParser 9 | from copy import deepcopy 10 | from gettext import gettext as _ 11 | from multiprocessing import Pool, Queue 12 | from time import sleep 13 | 14 | import numpy as np 15 | 16 | from .hyper_opt_utils import strategies 17 | 18 | # needed to work with pytorch multiprocess 19 | try: 20 | import torch 21 | import multiprocessing 22 | # multiprocessing.set_start_method('spawn', force=True) 23 | except ModuleNotFoundError: 24 | pass 25 | 26 | 27 | def optimize_parallel_gpu_private(args): 28 | trial_params, train_function = args[0], args[1] 29 | 30 | # get set of gpu ids 31 | gpu_id_set = g_gpu_id_q.get(block=True) 32 | 33 | try: 34 | 35 | # enable the proper gpus 36 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id_set 37 | 38 | # run training fx on the specific gpus 39 | results = train_function(trial_params, gpu_id_set) 40 | 41 | return [trial_params, results] 42 | 43 | except Exception as e: 44 | print('Caught exception in worker thread', e) 45 | 46 | # This prints the type, value, and stack trace of the 47 | # current exception being handled. 48 | traceback.print_exc() 49 | return [trial_params, None] 50 | 51 | finally: 52 | g_gpu_id_q.put(gpu_id_set) 53 | 54 | 55 | def optimize_parallel_cpu_private(args): 56 | trial_params, train_function = args[0], args[1] 57 | 58 | sleep(random.randint(0, 4)) 59 | 60 | # run training fx on the specific gpus 61 | results = train_function(trial_params) 62 | 63 | # True = completed 64 | return [trial_params, results] 65 | 66 | 67 | class HyperOptArgumentParser(ArgumentParser): 68 | """ 69 | Subclass of argparse ArgumentParser which adds optional calls to sample from lists or ranges 70 | Also enables running optimizations across parallel processes 71 | """ 72 | 73 | # these are commands injected by test tube from cluster operations 74 | TRIGGER_CMD = 'test_tube_from_cluster_hopt' 75 | SLURM_CMD_PATH = 'test_tube_slurm_cmd_path' 76 | SLURM_EXP_CMD = 'hpc_exp_number' 77 | SLURM_LOAD_CMD = 'test_tube_do_checkpoint_load' 78 | CMD_MAP = { 79 | TRIGGER_CMD: bool, 80 | SLURM_CMD_PATH: str, 81 | SLURM_EXP_CMD: int, 82 | SLURM_LOAD_CMD: bool 83 | } 84 | 85 | def __init__(self, strategy='grid_search', **kwargs): 86 | """ 87 | 88 | :param strategy: 'grid_search', 'random_search' 89 | :param enabled: 90 | :param experiment: 91 | :param kwargs: 92 | """ 93 | ArgumentParser.__init__(self, **kwargs) 94 | 95 | self.strategy = strategy 96 | self.trials = [] 97 | self.parsed_args = None 98 | self.opt_args = {} 99 | self.json_config_arg_name = None 100 | self.pool = None 101 | 102 | def __getstate__(self): 103 | # capture what is normally pickled 104 | state = self.__dict__.copy() 105 | 106 | # remove all functions from the namespace 107 | clean_state = {} 108 | for k, v in state.items(): 109 | if not hasattr(v, '__call__'): 110 | clean_state[k] = v 111 | 112 | # what we return here will be stored in the pickle 113 | return clean_state 114 | 115 | def __setstate__(self, newstate): 116 | # re-instate our __dict__ state from the pickled state 117 | self.__dict__.update(newstate) 118 | 119 | def add_argument(self, *args, **kwargs): 120 | super(HyperOptArgumentParser, self).add_argument(*args, 
**kwargs) 121 | 122 | def opt_list(self, *args, **kwargs): 123 | options = kwargs.pop("options", None) 124 | tunable = kwargs.pop("tunable", False) 125 | self.add_argument(*args, **kwargs) 126 | for i in range(len(args)): 127 | arg_name = args[i] 128 | self.opt_args[arg_name] = OptArg(obj_id=arg_name, opt_values=options, tunable=tunable) 129 | 130 | def opt_range( 131 | self, 132 | *args, 133 | **kwargs 134 | ): 135 | low = kwargs.pop("low", None) 136 | high = kwargs.pop("high", None) 137 | arg_type = kwargs["type"] 138 | nb_samples = kwargs.pop("nb_samples", 10) 139 | tunable = kwargs.pop("tunable", False) 140 | log_base = kwargs.pop("log_base", None) 141 | 142 | self.add_argument(*args, **kwargs) 143 | arg_name = args[-1] 144 | self.opt_args[arg_name] = OptArg( 145 | obj_id=arg_name, 146 | opt_values=[low, high], 147 | arg_type=arg_type, 148 | nb_samples=nb_samples, 149 | tunable=tunable, 150 | log_base=log_base, 151 | ) 152 | 153 | def json_config(self, *args, **kwargs): 154 | self.add_argument(*args, **kwargs) 155 | self.json_config_arg_name = re.sub('-', '', args[-1]) 156 | 157 | def __parse_args(self, args=None, namespace=None): 158 | # allow bypassing certain missing params which other parts of test tube may introduce 159 | args, argv = self.parse_known_args(args, namespace) 160 | args, argv = self.__whitelist_cluster_commands(args, argv) 161 | if argv: 162 | msg = _('unrecognized arguments: %s') 163 | self.error(msg % ' '.join(argv)) 164 | return args 165 | 166 | def __whitelist_cluster_commands(self, args, argv): 167 | parsed = {} 168 | 169 | # build a dict where key = arg, value = value of the arg or None if just a flag 170 | for i, arg_candidate in enumerate(argv): 171 | arg = None 172 | value = None 173 | 174 | # only look at --keys 175 | if '--' not in arg_candidate: 176 | continue 177 | 178 | # skip items not on the white list 179 | if arg_candidate[2:] not in HyperOptArgumentParser.CMD_MAP: 180 | continue 181 | 182 | arg = arg_candidate[2:] 183 | # pull out the value of the argument if given 184 | if i + 1 <= len(argv) - 1: 185 | if '--' not in argv[i + 1]: 186 | value = argv[i + 1] 187 | 188 | if arg is not None: 189 | parsed[arg] = value 190 | else: 191 | if arg is not None: 192 | parsed[arg] = value 193 | 194 | # add the whitelist cmds to the args 195 | all_values = set() 196 | for k, v in args.__dict__.items(): 197 | all_values.add(k) 198 | all_values.add(v) 199 | 200 | for arg, v in parsed.items(): 201 | v_parsed = self.__parse_primitive_arg_val(v) 202 | all_values.add(v) 203 | all_values.add(arg) 204 | args.__setattr__(arg, v_parsed) 205 | 206 | # make list with only the unknown args 207 | unk_args = [] 208 | for arg in argv: 209 | arg_candidate = re.sub('--', '', arg) 210 | is_bool = arg_candidate == 'True' or arg_candidate == 'False' 211 | if is_bool: continue 212 | 213 | if arg_candidate not in all_values: 214 | unk_args.append(arg) 215 | 216 | # when no bad args are left, return none to be consistent with super api 217 | if len(unk_args) == 0: 218 | unk_args = None 219 | 220 | # add hpc_exp_number if not passed in so we can never get None 221 | if HyperOptArgumentParser.SLURM_EXP_CMD not in args: 222 | args.__setattr__(HyperOptArgumentParser.SLURM_EXP_CMD, None) 223 | 224 | return args, unk_args 225 | 226 | def __parse_primitive_arg_val(self, val): 227 | if val is None: 228 | return True 229 | try: 230 | return int(val) 231 | except ValueError: 232 | try: 233 | return float(val) 234 | except ValueError: 235 | return val 236 | 237 | def parse_args(self, args=None, 
namespace=None): 238 | # call superclass arg first 239 | results = self.__parse_args(args, namespace) 240 | 241 | # extract vals 242 | old_args = vars(results) 243 | 244 | # override with json args if given 245 | if self.json_config_arg_name and old_args[self.json_config_arg_name]: 246 | for arg, v in self.__read_json_config(old_args[self.json_config_arg_name]).items(): 247 | old_args[arg] = v 248 | 249 | # track args 250 | self.parsed_args = deepcopy(old_args) 251 | # attach optimization fx 252 | old_args['trials'] = self.opt_trials 253 | old_args['optimize_parallel'] = self.optimize_parallel 254 | old_args['optimize_parallel_gpu'] = self.optimize_parallel_gpu 255 | old_args['optimize_parallel_cpu'] = self.optimize_parallel_cpu 256 | old_args['generate_trials'] = self.generate_trials 257 | old_args['optimize_trials_parallel_gpu'] = self.optimize_trials_parallel_gpu 258 | 259 | return TTNamespace(**old_args) 260 | 261 | def __read_json_config(self, file_path): 262 | with open(file_path) as json_data: 263 | json_args = json.load(json_data) 264 | return json_args 265 | 266 | def opt_trials(self, num): 267 | self.trials = strategies.generate_trials( 268 | strategy=self.strategy, 269 | flat_params=self.__flatten_params(self.opt_args), 270 | nb_trials=num, 271 | ) 272 | 273 | for trial in self.trials: 274 | ns = self.__namespace_from_trial(trial) 275 | yield ns 276 | 277 | def generate_trials(self, nb_trials): 278 | trials = strategies.generate_trials( 279 | strategy=self.strategy, 280 | flat_params=self.__flatten_params(self.opt_args), 281 | nb_trials=nb_trials, 282 | ) 283 | 284 | trials = [self.__namespace_from_trial(x) for x in trials] 285 | return trials 286 | 287 | def optimize_parallel_gpu( 288 | self, 289 | train_function, 290 | gpu_ids, 291 | max_nb_trials=None, 292 | ): 293 | """ 294 | Runs optimization across gpus with cuda drivers 295 | :param train_function: 296 | :param max_nb_trials: 297 | :param gpu_ids: List of strings like: ['0', '1, 3'] 298 | :return: 299 | """ 300 | self.trials = strategies.generate_trials( 301 | strategy=self.strategy, 302 | flat_params=self.__flatten_params(self.opt_args), 303 | nb_trials=max_nb_trials, 304 | ) 305 | 306 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials] 307 | 308 | # build q of gpu ids so we can use them in each process 309 | # this is thread safe so each process can pull out a gpu id, run its task and put it back when done 310 | if self.pool is None: 311 | gpu_q = Queue() 312 | for gpu_id in gpu_ids: 313 | gpu_q.put(gpu_id) 314 | 315 | # called by the Pool when a process starts 316 | def init(local_gpu_q): 317 | global g_gpu_id_q 318 | g_gpu_id_q = local_gpu_q 319 | 320 | # init a pool with the nb of worker threads we want 321 | nb_workers = len(gpu_ids) 322 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,)) 323 | 324 | # apply parallelization 325 | results = self.pool.map(optimize_parallel_gpu_private, self.trials) 326 | return results 327 | 328 | def optimize_trials_parallel_gpu( 329 | self, 330 | train_function, 331 | nb_trials, 332 | trials, 333 | gpu_ids, 334 | nb_workers=4, 335 | ): 336 | """ 337 | Runs optimization across gpus with cuda drivers 338 | :param train_function: 339 | :param nb_trials: 340 | :param gpu_ids: List of strings like: ['0', '1, 3'] 341 | :param nb_workers: 342 | :return: 343 | """ 344 | self.trials = trials 345 | self.trials = [(x, train_function) for x in self.trials] 346 | 347 | # build q of gpu ids so we can use them in each process 348 | # 
this is thread safe so each process can pull out a gpu id, run its task and put it back when done 349 | if self.pool is None: 350 | gpu_q = Queue() 351 | for gpu_id in gpu_ids: 352 | gpu_q.put(gpu_id) 353 | 354 | # called by the Pool when a process starts 355 | def init(local_gpu_q): 356 | global g_gpu_id_q 357 | g_gpu_id_q = local_gpu_q 358 | 359 | # init a pool with the nb of worker threads we want 360 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,)) 361 | 362 | # apply parallelization 363 | results = self.pool.map(optimize_parallel_gpu_private, self.trials) 364 | return results 365 | 366 | def optimize_parallel_cpu( 367 | self, 368 | train_function, 369 | nb_trials, 370 | nb_workers=4, 371 | ): 372 | """ 373 | Runs optimization across n cpus 374 | :param train_function: 375 | :param nb_trials: 376 | :param nb_workers: 377 | :return: 378 | """ 379 | self.trials = strategies.generate_trials( 380 | strategy=self.strategy, 381 | flat_params=self.__flatten_params(self.opt_args), 382 | nb_trials=nb_trials 383 | ) 384 | 385 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials] 386 | 387 | # init a pool with the nb of worker threads we want 388 | if self.pool is None: 389 | self.pool = Pool(processes=nb_workers) 390 | 391 | # apply parallelization 392 | results = self.pool.map(optimize_parallel_cpu_private, self.trials) 393 | return results 394 | 395 | def optimize_parallel( 396 | self, 397 | train_function, 398 | nb_trials, 399 | nb_parallel=4, 400 | ): 401 | self.trials = strategies.generate_trials( 402 | strategy=self.strategy, 403 | flat_params=self.__flatten_params(self.opt_args), 404 | nb_trials=nb_trials 405 | ) 406 | 407 | # nb of runs through all parallel systems 408 | fork_batches = [ 409 | self.trials[i:i + nb_parallel] for i in range(0, len(self.trials), nb_parallel) 410 | ] 411 | 412 | for fork_batch in fork_batches: 413 | children = [] 414 | 415 | # run n parallel forks 416 | for parallel_nb, trial in enumerate(fork_batch): 417 | 418 | # q up the trial and convert to a namespace 419 | ns = self.__namespace_from_trial(trial) 420 | 421 | # split new fork 422 | pid = os.fork() 423 | 424 | # when the process is a parent 425 | if pid: 426 | children.append(pid) 427 | 428 | # when process is a child 429 | else: 430 | # slight delay to make sure we don't overwrite over test tube log versions 431 | sleep(parallel_nb * 0.5) 432 | train_function(ns, parallel_nb) 433 | os._exit(0) 434 | 435 | for i, child in enumerate(children): 436 | os.waitpid(child, 0) 437 | 438 | def __namespace_from_trial(self, trial): 439 | trial_dict = {d['name']: d['val'] for d in trial} 440 | for k, v in self.parsed_args.items(): 441 | if k not in trial_dict: 442 | trial_dict[k] = v 443 | 444 | return TTNamespace(**trial_dict) 445 | 446 | def __flatten_params(self, params): 447 | """ 448 | Turns a list of parameters with values into a flat tuple list of lists 449 | so we can permute 450 | :param params: 451 | :return: 452 | """ 453 | flat_params = [] 454 | for i, (opt_name, opt_arg) in enumerate(params.items()): 455 | if opt_arg.tunable: 456 | clean_name = opt_name.strip('-') 457 | clean_name = re.sub('-', '_', clean_name) 458 | param_groups = [] 459 | for val in opt_arg.opt_values: 460 | param_groups.append({'idx': i, 'val': val, 'name': clean_name}) 461 | flat_params.append(param_groups) 462 | return flat_params 463 | 464 | 465 | class TTNamespace(argparse.Namespace): 466 | 467 | def __str__(self): 468 | result = '-' * 100 + '\nHyperparameters:\n' 469 
| for k, v in self.__dict__.items(): 470 | result += '{0:20}: {1}\n'.format(k, v) 471 | return result 472 | 473 | def __getstate__(self): 474 | # capture what is normally pickled 475 | state = self.__dict__.copy() 476 | 477 | # remove all functions from the namespace 478 | clean_state = {} 479 | for k, v in state.items(): 480 | if not hasattr(v, '__call__'): 481 | clean_state[k] = v 482 | 483 | # what we return here will be stored in the pickle 484 | return clean_state 485 | 486 | def __setstate__(self, newstate): 487 | # re-instate our __dict__ state from the pickled state 488 | self.__dict__.update(newstate) 489 | 490 | 491 | class OptArg(object): 492 | def __init__( 493 | self, 494 | obj_id, 495 | opt_values, 496 | arg_type=None, 497 | nb_samples=None, 498 | tunable=False, 499 | log_base=None, 500 | ): 501 | self.opt_values = opt_values 502 | self.obj_id = obj_id 503 | self.tunable = tunable 504 | 505 | # convert range to list of values 506 | if nb_samples: 507 | low, high = opt_values 508 | 509 | if log_base is None: 510 | # random search on uniform scale 511 | if arg_type is int: 512 | self.opt_values = [int(_) for _ in np.random.choice(np.arange(low, high), nb_samples, replace=False)] 513 | elif arg_type is float: 514 | self.opt_values = np.random.uniform(low, high, nb_samples) 515 | else: 516 | # random search on log scale with specified base 517 | assert high >= low > 0, "`opt_values` must be positive to do log-scale search." 518 | 519 | log_low, log_high = math.log(low, log_base), math.log(high, log_base) 520 | 521 | self.opt_values = log_base ** np.random.uniform(log_low, log_high, nb_samples) 522 | 523 | -------------------------------------------------------------------------------- /test_tube/hpc.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import signal 4 | import sys 5 | import time 6 | import traceback 7 | from subprocess import call 8 | 9 | from .argparse_hopt import HyperOptArgumentParser 10 | 11 | 12 | def exit(): 13 | time.sleep(1) 14 | os._exit(1) 15 | 16 | 17 | class AbstractCluster(object): 18 | 19 | RUN_CMD = 'sbatch' 20 | def __init__( 21 | self, 22 | hyperparam_optimizer=None, 23 | log_path=None, 24 | python_cmd='python3', 25 | enable_log_err=True, 26 | enable_log_out=True, 27 | ): 28 | self.hyperparam_optimizer = hyperparam_optimizer 29 | self.log_path = log_path 30 | 31 | self.enable_log_err = enable_log_err 32 | self.enable_log_out = enable_log_out 33 | self.slurm_files_log_path = None 34 | self.err_log_path = None 35 | self.out_log_path = None 36 | self.modules = [] 37 | self.script_name = os.path.realpath(sys.argv[0]) 38 | self.job_time = '15:00' 39 | self.minutes_to_checkpoint_before_walltime = 5 40 | self.per_experiment_nb_gpus = 1 41 | self.per_experiment_nb_cpus = 1 42 | self.per_experiment_nb_nodes = 1 43 | self.memory_mb_per_node = 2000 44 | self.email = None 45 | self.notify_on_end = False 46 | self.notify_on_fail = False 47 | self.job_name = None 48 | self.python_cmd = python_cmd 49 | self.gpu_type = None 50 | self.on_gpu = False 51 | self.call_load_checkpoint = False 52 | self.commands = [] 53 | self.slurm_commands = [] 54 | self.hpc_exp_number = 0 55 | 56 | # these are set via getters and setters so we can use a BaseManager which can be shared across processes 57 | self.checkpoint_save_function = None 58 | self.checkpoint_load_function = None 59 | 60 | # detect when this was called because a slurm object started a hopt. 
61 | # if true, remove the flag so tt logs don't show it 62 | if hyperparam_optimizer is not None: 63 | 64 | self.is_from_slurm_object = HyperOptArgumentParser.TRIGGER_CMD in vars(self.hyperparam_optimizer) and vars(self.hyperparam_optimizer)[HyperOptArgumentParser.TRIGGER_CMD] == True 65 | if self.is_from_slurm_object: 66 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.TRIGGER_CMD) 67 | 68 | self.call_load_checkpoint = HyperOptArgumentParser.SLURM_LOAD_CMD in vars(self.hyperparam_optimizer) 69 | if self.call_load_checkpoint: 70 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.SLURM_LOAD_CMD) 71 | 72 | self.hpc_exp_number = self.hyperparam_optimizer.hpc_exp_number 73 | 74 | def set_checkpoint_save_function(self, fx, kwargs): 75 | self.checkpoint_save_function = [fx, kwargs] 76 | 77 | def get_checkpoint_save_function(self): 78 | return self.checkpoint_save_function 79 | 80 | def set_checkpoint_load_function(self, fx, kwargs): 81 | # if we were passed in the load flag, then we call the load function as soon as it's added 82 | if self.call_load_checkpoint: 83 | fx(**kwargs) 84 | 85 | self.checkpoint_load_function = [fx, kwargs] 86 | 87 | def get_checkpoint_load_function(self): 88 | return self.checkpoint_load_function 89 | 90 | def add_slurm_cmd(self, cmd, value, comment): 91 | self.slurm_commands.append((cmd, value, comment)) 92 | 93 | def add_command(self, cmd): 94 | self.commands.append(cmd) 95 | 96 | def load_modules(self, modules): 97 | self.modules = modules 98 | 99 | def notify_job_status(self, email, on_done, on_fail): 100 | self.email = email 101 | self.notify_on_end = on_done 102 | self.notify_on_fail = on_fail 103 | 104 | def optimize_parallel_cluster(self, train_function, nb_trials, job_name): 105 | raise NotImplementedError 106 | 107 | def optimize_parallel_slurm(self, job_name, output_file, error_file, job_time, nb_gpus, nb_nodes, memory, notifications_email, gpu_types): 108 | pass 109 | 110 | 111 | class SlurmCluster(AbstractCluster): 112 | def __init__(self, *args, **kwargs): 113 | super(SlurmCluster, self).__init__(*args, **kwargs) 114 | 115 | def optimize_parallel_cluster_gpu( 116 | self, 117 | train_function, 118 | nb_trials, 119 | job_name, 120 | enable_auto_resubmit=False, 121 | job_display_name=None 122 | ): 123 | if job_display_name is None: 124 | job_display_name = job_name 125 | 126 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name, 127 | enable_auto_resubmit, on_gpu=True) 128 | 129 | def optimize_parallel_cluster_cpu( 130 | self, 131 | train_function, 132 | nb_trials, 133 | job_name, 134 | enable_auto_resubmit=False, 135 | job_display_name=None 136 | ): 137 | if job_display_name is None: 138 | job_display_name = job_name 139 | 140 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name, 141 | enable_auto_resubmit, on_gpu=False) 142 | 143 | def __optimize_parallel_cluster_internal( 144 | self, 145 | train_function, 146 | nb_trials, 147 | job_name, 148 | job_display_name, 149 | enable_auto_resubmit, 150 | on_gpu 151 | ): 152 | """ 153 | Runs optimization on the attached cluster 154 | :param train_function: 155 | :param nb_trials: 156 | :param job_name: 157 | :return: 158 | """ 159 | self.job_name = job_name 160 | self.job_display_name = job_display_name 161 | self.on_gpu = on_gpu 162 | self.enable_auto_resubmit = enable_auto_resubmit 163 | 164 | # layout logging structure 165 | self.__layout_logging_dir() 166 | 167 | if self.is_from_slurm_object: 168 | 
# Script is called by slurm: it's an actual experiment. 169 | self.__run_experiment(train_function) 170 | else: 171 | # Launcher script. Generate trials and launch jobs. 172 | 173 | # generate hopt trials 174 | trials = self.hyperparam_optimizer.generate_trials(nb_trials) 175 | 176 | # get the max test tube exp version so far if it's there 177 | scripts_path = os.path.join(self.log_path, 'slurm_out_logs') 178 | next_trial_version = self.__get_max_trial_version(scripts_path) 179 | 180 | # for each trial, generate a slurm command 181 | for i, trial_params in enumerate(trials): 182 | exp_i = i + next_trial_version 183 | self.schedule_experiment(trial_params, exp_i) 184 | 185 | def schedule_experiment(self, trial_params, exp_i): 186 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d__%H-%M-%S") 187 | timestamp = 'trial_{}_{}'.format(exp_i, timestamp) 188 | 189 | # generate command 190 | slurm_cmd_script_path = os.path.join(self.slurm_files_log_path, '{}_slurm_cmd.sh'.format(timestamp)) 191 | slurm_cmd = self.__build_slurm_command(trial_params, slurm_cmd_script_path, timestamp, exp_i, self.on_gpu) 192 | self.__save_slurm_cmd(slurm_cmd, slurm_cmd_script_path) 193 | 194 | # run script to launch job 195 | print('\nlaunching exp...') 196 | result = call('{} {}'.format(AbstractCluster.RUN_CMD, slurm_cmd_script_path), shell=True) 197 | if result == 0: 198 | print('launched exp ', slurm_cmd_script_path) 199 | else: 200 | print('launch failed...') 201 | 202 | def slurm_time_to_seconds(self, job_time): 203 | seconds = 0 204 | time_component = job_time 205 | if '-' in job_time: 206 | days, time_component = job_time.split('-') 207 | seconds += int(days) * 24 * 60 * 60 208 | 209 | time_components = time_component.split(':') 210 | if len(time_components) == 3: 211 | hours, minutes, secs = time_components 212 | time_seconds = int(secs) + (int(minutes) * 60) + (int(hours) * 60 * 60) 213 | seconds += time_seconds 214 | 215 | elif len(time_components) == 2: 216 | minutes, secs = time_components 217 | time_seconds = int(secs) + (int(minutes) * 60) 218 | seconds += time_seconds 219 | 220 | elif len(time_components) == 1: 221 | secs = time_components[0] 222 | seconds += int(secs) 223 | 224 | return seconds 225 | 226 | def call_save(self): 227 | print('calling save') 228 | 229 | # if save function was passed, call it 230 | if self.get_checkpoint_save_function() is not None: 231 | save_fx, kwargs = self.get_checkpoint_save_function() 232 | save_fx(**kwargs) 233 | 234 | # if we're here, the job didn't finish and we were given a save function 235 | # if we were given a load function, then schedule the program again and pass in the load function 236 | if self.get_checkpoint_load_function() is not None: 237 | job_id = os.environ['SLURM_JOB_ID'] 238 | cmd = 'scontrol requeue {}'.format(job_id) 239 | 240 | print('\nrequeing job {}...'.format(job_id)) 241 | result = call(cmd, shell=True) 242 | if result == 0: 243 | print('requeued exp ', job_id) 244 | else: 245 | print('requeue failed...') 246 | 247 | # stop program 248 | os._exit(0) 249 | 250 | def sig_handler(self, signum, frame): 251 | print("caught signal", signum) 252 | self.call_save() 253 | # sys.exit(-1) 254 | 255 | # ------------------------ 256 | # HANDLE SLURM SIGNALS 257 | # ------------------------ 258 | def term_handler(self, signum, frame): 259 | print("bypassing sigterm") 260 | 261 | def __run_experiment(self, train_function): 262 | if self.enable_auto_resubmit: 263 | print('setting signal') 264 | signal.signal(signal.SIGUSR1, self.sig_handler) 265 | 
signal.signal(signal.SIGTERM, self.term_handler) 266 | 267 | try: 268 | # run training 269 | train_function(self.hyperparam_optimizer, self) 270 | 271 | except Exception as e: 272 | print('Caught exception in worker thread', e) 273 | 274 | # This prints the type, value, and stack trace of the 275 | # current exception being handled. 276 | traceback.print_exc() 277 | raise SystemExit 278 | 279 | def __save_slurm_cmd(self, slurm_cmd, slurm_cmd_script_path): 280 | with open(slurm_cmd_script_path, mode='w') as file: 281 | file.write(slurm_cmd) 282 | 283 | def __get_max_trial_version(self, path): 284 | files = os.listdir(path) 285 | version_files = [f for f in files if 'trial_' in f] 286 | if len(version_files) > 0: 287 | # regex out everything except file version for ve 288 | versions = [int(f_name.split('_')[1]) for f_name in version_files] 289 | max_version = max(versions) 290 | return max_version + 1 291 | else: 292 | return 0 293 | 294 | def __layout_logging_dir(self): 295 | """ 296 | Generates dir structure for logging errors and outputs 297 | :return: 298 | """ 299 | 300 | # format the logging folder path 301 | slurm_out_path = os.path.join(self.log_path, self.job_name) 302 | 303 | self.log_path = slurm_out_path 304 | 305 | # if we have a test tube name, make the folder and set as the logging destination 306 | if not os.path.exists(slurm_out_path): 307 | os.makedirs(slurm_out_path) 308 | 309 | # when err logging is enabled, build add the err logging folder 310 | if self.enable_log_err: 311 | err_path = os.path.join(slurm_out_path, 'slurm_err_logs') 312 | if not os.path.exists(err_path): 313 | os.makedirs(err_path) 314 | self.err_log_path = err_path 315 | 316 | # when out logging is enabled, build add the out logging folder 317 | if self.enable_log_out: 318 | out_path = os.path.join(slurm_out_path, 'slurm_out_logs') 319 | if not os.path.exists(out_path): 320 | os.makedirs(out_path) 321 | self.out_log_path = out_path 322 | 323 | # place where slurm files log to 324 | self.slurm_files_log_path = os.path.join(slurm_out_path, 'slurm_scripts') 325 | if not os.path.exists(self.slurm_files_log_path): 326 | os.makedirs(self.slurm_files_log_path) 327 | 328 | def __get_hopt_params(self, trial): 329 | """ 330 | Turns hopt trial into script params 331 | :param trial: 332 | :return: 333 | """ 334 | 335 | params = [] 336 | for k in trial.__dict__: 337 | v = trial.__dict__[k] 338 | 339 | # don't add None params 340 | if v is None or v is False: 341 | continue 342 | 343 | # put everything in quotes except bools 344 | if self.__should_escape(v): 345 | cmd = '--{} \"{}\"'.format(k, v) 346 | else: 347 | cmd = '--{} {}'.format(k, v) 348 | params.append(cmd) 349 | 350 | # this arg lets the hyperparameter optimizer do its thing 351 | params.append('--{}'.format(HyperOptArgumentParser.TRIGGER_CMD)) 352 | 353 | full_cmd = ' '.join(params) 354 | return full_cmd 355 | 356 | def __should_escape(self, v): 357 | v = str(v) 358 | return '[' in v or ';' in v or ' ' in v 359 | 360 | def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i, on_gpu): 361 | sub_commands = [] 362 | 363 | command =[ 364 | '#!/bin/bash', 365 | '#', 366 | '# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)', 367 | '#################\n' 368 | ] 369 | sub_commands.extend(command) 370 | 371 | # add job name 372 | job_with_version = '{}v{}'.format(self.job_display_name, exp_i) 373 | command = [ 374 | '# set a job name', 375 | '#SBATCH --job-name={}'.format(job_with_version), 376 | 
'#################\n', 377 | ] 378 | sub_commands.extend(command) 379 | 380 | # add out output 381 | if self.enable_log_out: 382 | out_path = os.path.join(self.out_log_path, '{}_slurm_output_%j.out'.format(timestamp)) 383 | command = [ 384 | '# a file for job output, you can check job progress', 385 | '#SBATCH --output={}'.format(out_path), 386 | '#################\n', 387 | ] 388 | sub_commands.extend(command) 389 | 390 | # add err output 391 | if self.enable_log_err: 392 | err_path = os.path.join(self.err_log_path, '{}_slurm_output_%j.err'.format(timestamp)) 393 | command = [ 394 | '# a file for errors', 395 | '#SBATCH --error={}'.format(err_path), 396 | '#################\n', 397 | ] 398 | sub_commands.extend(command) 399 | 400 | # add job time 401 | command = [ 402 | '# time needed for job', 403 | '#SBATCH --time={}'.format(self.job_time), 404 | '#################\n' 405 | ] 406 | sub_commands.extend(command) 407 | 408 | # add nb of gpus 409 | if self.per_experiment_nb_gpus > 0 and on_gpu: 410 | command = [ 411 | '# gpus per node', 412 | '#SBATCH --gres=gpu:{}'.format(self.per_experiment_nb_gpus), 413 | '#################\n' 414 | ] 415 | if self.gpu_type is not None: 416 | command = [ 417 | '# gpus per node', 418 | '#SBATCH --gres=gpu:{}:{}'.format(self.gpu_type, self.per_experiment_nb_gpus), 419 | '#################\n' 420 | ] 421 | sub_commands.extend(command) 422 | 423 | # add nb of cpus if not looking at a gpu job 424 | if self.per_experiment_nb_cpus > 0: 425 | command = [ 426 | '# cpus per job', 427 | '#SBATCH --cpus-per-task={}'.format(self.per_experiment_nb_cpus), 428 | '#################\n' 429 | ] 430 | sub_commands.extend(command) 431 | 432 | # pick nb nodes 433 | command = [ 434 | '# number of requested nodes', 435 | '#SBATCH --nodes={}'.format(self.per_experiment_nb_nodes), 436 | '#################\n' 437 | ] 438 | sub_commands.extend(command) 439 | 440 | # pick memory per node 441 | command = [ 442 | '# memory per node', 443 | '#SBATCH --mem={}'.format(self.memory_mb_per_node), 444 | '#################\n' 445 | ] 446 | sub_commands.extend(command) 447 | 448 | # add signal command to catch job termination 449 | command = [ 450 | '# slurm will send a signal this far out before it kills the job', 451 | f'#SBATCH --signal=USR1@{self.minutes_to_checkpoint_before_walltime * 60}', 452 | '#################\n' 453 | ] 454 | 455 | sub_commands.extend(command) 456 | 457 | # Subscribe to email if requested 458 | mail_type = [] 459 | if self.notify_on_end: 460 | mail_type.append('END') 461 | if self.notify_on_fail: 462 | mail_type.append('FAIL') 463 | if len(mail_type) > 0: 464 | mail_type_query = [ 465 | '# Have SLURM send you an email when the job ends or fails', 466 | '#SBATCH --mail-type={}'.format(','.join(mail_type)) 467 | ] 468 | sub_commands.extend(mail_type_query) 469 | 470 | email_query = [ 471 | '#SBATCH --mail-user={}'.format(self.email), 472 | ] 473 | sub_commands.extend(email_query) 474 | 475 | # add custom sbatch commands 476 | sub_commands.append('\n') 477 | for (cmd, value, comment) in self.slurm_commands: 478 | comment = '# {}'.format(comment) 479 | cmd = '#SBATCH --{}={}'.format(cmd, value) 480 | spaces = '#################\n' 481 | sub_commands.extend([comment, cmd, spaces]) 482 | 483 | # load modules 484 | sub_commands.append('\n') 485 | for module in self.modules: 486 | cmd = 'module load {}'.format(module) 487 | sub_commands.append(cmd) 488 | 489 | # remove spaces before the hash 490 | sub_commands = [x.lstrip() for x in sub_commands] 491 | 492 | # add 
additional commands 493 | for cmd in self.commands: 494 | sub_commands.append(cmd) 495 | sub_commands.append('\n') 496 | 497 | # add run command 498 | trial_args = self.__get_hopt_params(trial) 499 | trial_args = '{} --{} {} --{} {}'.format(trial_args, 500 | HyperOptArgumentParser.SLURM_CMD_PATH, 501 | slurm_cmd_script_path, 502 | HyperOptArgumentParser.SLURM_EXP_CMD, 503 | exp_i) 504 | 505 | cmd = 'srun {} {} {}'.format(self.python_cmd, self.script_name, trial_args) 506 | sub_commands.append(cmd) 507 | 508 | # build full command with empty lines in between 509 | full_command = '\n'.join(sub_commands) 510 | return full_command 511 | -------------------------------------------------------------------------------- /site/hpc/SlurmCluster/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | SlurmCluster class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |

SlurmCluster class API

145 |

[Github Code]

146 |

The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via Slurm workload manager.

147 |

At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If a job hits the walltime but has not completed, the SlurmCluster will checkpoint the model and submit a new job to continue training using the saved weights.

148 | 152 |

You can instantiate a SlurmCluster via:

153 |
from test_tube.hpc import SlurmCluster
154 | 
155 | # hyperparameters is a test-tube hyper params object
156 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/
157 | hyperparams = args.parse()
158 | 
159 | # init cluster
160 | cluster = SlurmCluster(
161 |     hyperparam_optimizer=hyperparams,
162 |     log_path='/path/to/log/results/to',
163 |     python_cmd='python3'
164 | )
165 | 
166 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
167 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
168 | 
169 | # set the job options. In this instance, we'll run 20 different models
170 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
171 | cluster.per_experiment_nb_gpus = 1
172 | cluster.per_experiment_nb_nodes = 1
173 | 
174 | # we'll request 10GB of memory per node
175 | cluster.memory_mb_per_node = 10000
176 | 
177 | # set a walltime of 10 minutes
178 | cluster.job_time = '10:00'
179 | 
180 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job.
181 | # you must provide your own loading and saving function which the cluster object will call
182 | cluster.minutes_to_checkpoint_before_walltime = 1
183 | 
184 | # run the models on the cluster
185 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch')
186 | 
187 | 188 |
189 |

init options

190 |

hyperparam_optimizer

191 |

A HyperOptArgumentParser object 192 | which contains all permutations of model hyperparameters to run.

193 |
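
For reference, a minimal sketch of how the hyperparameter object passed in here is typically built (the argument names and values below are illustrative, not part of the API):

from test_tube.argparse_hopt import HyperOptArgumentParser

# strategy can be 'grid_search' or 'random_search'
parser = HyperOptArgumentParser(strategy='grid_search')

# a fixed (non-tunable) argument
parser.add_argument('--data_path', default='/tmp/data', type=str)

# tunable arguments: sampled from a list or from a range
parser.opt_list('--batch_size', default=32, type=int, options=[16, 32, 64], tunable=True)
parser.opt_range('--learning_rate', default=0.001, type=float, low=0.0001, high=0.1, nb_samples=10, tunable=True)

hyperparams = parser.parse_args()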

log_path

194 |

Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves Experiment information.

195 |

python_cmd

196 |

This is the command that starts the python program. Normally it is:

197 |
# python 2
198 | python main.py   
199 | 
200 | # python 3   
201 | python3 main.py
202 | 
203 | 204 |

enable_log_err

205 |

If true, saves slurm error logs to the path at log_path. If anything goes wrong in your job, you'll find the error here.

206 |

enable_log_out

207 |

If true, saves slurm output logs to the path at log_path. This file contains all outputs that would show up on the console normally.

208 |

test_tube_exp_name

209 |

When given, the slurm scripts and logs are organized to match the folder structure of the corresponding Experiment object's output.

210 |

Properties

211 |

job_time
212 | String. Walltime requested. Examples:

213 |
# 1 hour and 10 minutes    
214 | cluster.job_time = '1:10:00'
215 | 
216 | # 1 day and 1 hour and 10 minutes    
217 | cluster.job_time = '1-1:10:00'
218 | 
219 | # 1 day and 1 hour and 10 minutes    
220 | cluster.job_time = '25:10:00'   
221 | 
222 | # 10 minutes    
223 | cluster.job_time = '10:00'   
224 | 
225 | # 10 seconds    
226 | cluster.job_time = '10'   
227 | 
228 | 229 |

minutes_to_checkpoint_before_walltime
230 | Int. Minutes before walltime when a continuation job will be auto-submitted.

231 |
cluster.job_time = '10:00'   
232 | cluster.minutes_to_checkpoint_before_walltime = 2
233 | 
234 | # A new job will be submitted to continue training after the job has been running for 8 minutes.
235 | 
236 | 237 |
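
Under the hood this property maps to an sbatch signal directive in the generated script; a sketch of what gets generated (based on test_tube/hpc.py):

# with minutes_to_checkpoint_before_walltime = 2 the generated script contains:
# #SBATCH --signal=USR1@120
#
# Slurm then sends SIGUSR1 120 seconds before the walltime; when auto-resubmission
# is enabled, test-tube's signal handler calls your checkpoint save function and
# requeues the job (scontrol requeue) so training continues from the saved state.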

per_experiment_nb_gpus
238 | Int. Number of GPUs each job will get.

239 |
# EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time).   
240 | cluster.per_experiment_nb_gpus = 2  
241 | 
242 | 243 |

per_experiment_nb_cpus
244 | Int. Number of CPUs each job will get.

245 |
cluster.per_experiment_nb_cpus = 1 
246 | 
247 | 248 |

per_experiment_nb_nodes
249 | Int. Number of nodes each job will get.

250 |
cluster.per_experiment_nb_nodes = 1 
251 | 
252 | 253 |

gpu_type
254 | String. Gpu type requested. Example:

255 |
cluster.gpu_type = '1080ti'   
256 | 
257 | 258 |
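
Together with per_experiment_nb_gpus, the GPU type is translated into the gres request of the generated script (based on test_tube/hpc.py). For example, with per_experiment_nb_gpus = 2 and gpu_type = '1080ti':

# the generated script will request gpus like this:
# #SBATCH --gres=gpu:1080ti:2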
259 |

Methods

260 |

set_checkpoint_save_function

261 |
cluster.set_checkpoint_save_function(fx, kwargs)    
262 | 
263 | 264 |

Called if the model isn't finished training minutes_to_checkpoint_before_walltime minutes before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = 1, the SlurmCluster will call your save function after 14 minutes of training.

265 |
    266 |
  • fx A python function.
  • 267 |
  • kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
  • 268 |
269 |

Example

270 |
def my_save_function(arg_1, arg_k):  
271 |     # ... save my model here    
272 | 
273 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})    
274 | 
275 | 
276 | 277 |

set_checkpoint_load_function

278 |
cluster.set_checkpoint_load_function(fx, kwargs)    
279 | 
280 | 281 |

Called internally when a job is auto-submitted by the SlurmCluster, giving your program a chance to load the model weights (or anything else it needs) to continue training.
282 | If the current job is a continuation of a checkpointed run, your load function is called immediately when you register it with this method.

283 |
    284 |
  • fx A python function.
  • 285 |
  • kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
  • 286 |
287 |

Example

288 |
def my_load_function(arg_1, arg_k):  
289 |     # ... restore my model here    
290 | 
291 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})    
292 | 
293 | 
294 | 295 |
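
Putting the two methods together, here is a minimal sketch of a matching save/load pair. The checkpoint path and the use of torch.save / torch.load are illustrative assumptions, not part of the test-tube API; model and cluster come from your train function:

import torch

CKPT_PATH = '/path/to/checkpoints/weights.ckpt'  # illustrative path

def save_checkpoint(model):
    # called by the SlurmCluster shortly before the walltime is reached
    torch.save(model.state_dict(), CKPT_PATH)

def load_checkpoint(model):
    # runs immediately when registered, but only on auto-resubmitted (continuation) jobs
    model.load_state_dict(torch.load(CKPT_PATH))

# inside your train function, after the model is built
cluster.set_checkpoint_save_function(save_checkpoint, kwargs={'model': model})
cluster.set_checkpoint_load_function(load_checkpoint, kwargs={'model': model})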

add_slurm_cmd

296 |
cluster.add_slurm_cmd(cmd, value, comment)
297 | 
298 | 299 |

Manually adds any Slurm command you need to the generated submit script. All available commands are listed in the Slurm sbatch documentation.

300 |
    301 |
  • cmd String with the bash command.
  • 302 |
  • value String value for the command. Numerical values must still be passed as strings, e.g. '1'.
  • 303 |
  • comment String with the command comment.
  • 304 |
305 |

Example

306 |
cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')
307 | 
308 | # the above command will add an entry like this to the slurm script   
309 | 
310 | # #nb cpus per task
311 | # #SBATCH --cpus-per-task=1
312 | # ############
313 | 
314 | 
315 | 316 |

add_command

317 |
cluster.add_command(cmd)    
318 | 
319 | 320 |

Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, or run anything else you would normally call from bash.

321 |
    322 |
  • cmd String with your bash command.
  • 323 |
324 |

Example

325 |
# load the anaconda package on the launch node   
326 | cluster.add_command('module load anaconda')   
327 | 
328 | # activate the environment on the launch node   
329 | cluster.add_command('source activate myCondaEnv')   
330 | 
331 | 332 |

load_modules

333 |
cluster.load_modules(modules)  
334 | 
335 | 336 |

Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running module avail.
337 | - modules Array of module names.

338 |

Example

339 |
cluster.load_modules([
340 |     'python-3',
341 |     'anaconda3'
342 | ])   
343 | 
344 | 345 |

notify_job_status

346 |
cluster.notify_job_status(email, on_done, on_fail)  
347 | 
348 | 349 |

Sets the email address Slurm will use to notify you about changes in the job's status (completion or failure).

350 |
    351 |
  • email String. Email address to get notifications.
  • 352 |
  • on_done Boolean. If true, you'll get an email when the job completes.
  • 353 |
  • on_fail Boolean. If true, you'll get an email if the job fails.
  • 354 |
355 |

Example

356 |
cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)   
357 | 
358 | 359 |

optimize_parallel_cluster_gpu

360 |
cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None)  
361 | 
362 | 363 |

Launches the hyperparameter search across the cluster nodes.
364 | - train_function The entry point to start your training routine.
365 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
366 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
367 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).

368 |

Example

369 |
def main(hparams, cluster):   
370 |     # do your own generic training code here... 
371 |     # init model
372 |     model = model_build(hparams)    
373 | 
374 |     # set the load and save fxs
375 |     cluster.set_checkpoint_save_function(fx, {})
376 |     cluster.set_checkpoint_load_function(fx, {})
377 | 
378 |     # train ...
379 | 
380 | 
381 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')    
382 | 
383 | 384 |

Now if you get the job information, you'll see this:

385 |
(conda_env) [user@node dir]$ squeue -u my_name
386 |              JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
387 |             104040       all  mjv0   my_name  R      58:22      1 nodeName
388 |             104041       all  mjv1   my_name  R      58:22      1 nodeName
389 |             104042       all  mjv2   my_name  R      58:22      1 nodeName
390 |             104043       all  mjv3   my_name  R      58:22      1 nodeName
391 | 
392 | 393 |

optimize_parallel_cluster_cpu

394 |
cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None)  
395 | 
396 | 397 |

Launches the hyperparameter search across the cluster nodes using cpus.
398 | - train_function The entry point to start your training routine.
399 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
400 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
401 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).

402 |

Example

403 |
def main(hparams, cluster):   
404 |     # do your own generic training code here... 
405 |     # init model
406 |     model = model_build(hparams)    
407 | 
408 |     # set the load and save fxs
409 |     cluster.set_checkpoint_save_function(fx, {})
410 |     cluster.set_checkpoint_load_function(fx, {})
411 | 
412 |     # train ...
413 | 
414 | 
415 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')    
416 | 
417 | 418 |

Now if you get the job information, you'll see this:

419 |
(conda_env) [user@node dir]$ squeue -u my_name
420 |              JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
421 |             104040       all  mjv0   my_name  R      58:22      1 nodeName
422 |             104041       all  mjv1   my_name  R      58:22      1 nodeName
423 |             104042       all  mjv2   my_name  R      58:22      1 nodeName
424 |             104043       all  mjv3   my_name  R      58:22      1 nodeName
425 | 
426 | 427 |
428 |
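
For completeness, a minimal end-to-end launcher sketch tying the pieces above together (paths, hyperparameter names and trial counts are illustrative):

from test_tube.argparse_hopt import HyperOptArgumentParser
from test_tube.hpc import SlurmCluster

def train(hparams, cluster):
    # build the model from hparams, register the checkpoint save/load
    # functions on `cluster`, then run the training loop
    ...

if __name__ == '__main__':
    parser = HyperOptArgumentParser(strategy='random_search')
    parser.opt_list('--learning_rate', default=0.001, type=float,
                    options=[0.001, 0.0001], tunable=True)
    hyperparams = parser.parse_args()

    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path='/path/to/log/results/to',
        python_cmd='python3'
    )
    cluster.per_experiment_nb_gpus = 1
    cluster.job_time = '10:00'

    # each trial becomes its own sbatch job; this same script is re-run on the cluster
    cluster.optimize_parallel_cluster_gpu(train, nb_trials=4, job_name='first_tt_batch',
                                          job_display_name='tt')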
471 | 472 | 473 | 474 | 475 | 476 | 477 | -------------------------------------------------------------------------------- /test_tube/log.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import json 3 | import os 4 | import shutil 5 | from datetime import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from imageio import imwrite 10 | from tensorboard.compat.proto.event_pb2 import Event 11 | from tensorboard.compat.proto.event_pb2 import SessionLog 12 | from torch.utils.tensorboard import SummaryWriter, FileWriter 13 | 14 | # constants 15 | _ROOT = os.path.abspath(os.path.dirname(__file__)) 16 | 17 | # ----------------------------- 18 | # Experiment object 19 | # ----------------------------- 20 | 21 | 22 | class DDPExperiment(object): 23 | def __init__( 24 | self, 25 | exp 26 | ): 27 | """ 28 | Used as meta_data storage if the experiment needs to be pickled 29 | :param name: 30 | :param debug: 31 | :param version: 32 | :param save_dir: 33 | :param autosave: 34 | :param description: 35 | :param create_git_tag: 36 | :param args: 37 | :param kwargs: 38 | """ 39 | 40 | self.tag_markdown_saved = exp.tag_markdown_saved 41 | self.no_save_dir = exp.no_save_dir 42 | self.metrics = exp.metrics 43 | self.tags = exp.tags 44 | self.name = exp.name 45 | self.debug = exp.debug 46 | self.version = exp.version 47 | self.autosave = exp.autosave 48 | self.description = exp.description 49 | self.create_git_tag = exp.create_git_tag 50 | self.exp_hash = exp.exp_hash 51 | self.created_at = exp.created_at 52 | self.save_dir = exp.save_dir 53 | 54 | 55 | def get_non_ddp_exp(self): 56 | return Experiment( 57 | name=self.name, 58 | debug=self.debug, 59 | version=self.version, 60 | save_dir=self.save_dir, 61 | autosave=self.autosave, 62 | description=self.description, 63 | create_git_tag=self.create_git_tag 64 | ) 65 | 66 | class Experiment(SummaryWriter): 67 | 68 | def __init__( 69 | self, 70 | save_dir=None, 71 | name='default', 72 | debug=False, 73 | version=None, 74 | autosave=False, 75 | description=None, 76 | create_git_tag=False, 77 | rank=0, 78 | *args, **kwargs 79 | ): 80 | """ 81 | A new Experiment object defaults to 'default' unless a specific name is provided 82 | If a known name is already provided, then the file version is changed 83 | :param name: 84 | :param debug: 85 | """ 86 | 87 | # change where the save dir is if requested 88 | 89 | if save_dir is not None: 90 | global _ROOT 91 | _ROOT = save_dir 92 | 93 | self.save_dir = save_dir 94 | self.tag_markdown_saved = False 95 | self.no_save_dir = save_dir is None 96 | self.metrics = [] 97 | self.tags = {} 98 | self.name = name 99 | self.debug = debug 100 | self.version = version 101 | self.autosave = autosave 102 | self.description = description 103 | self.create_git_tag = create_git_tag 104 | self.exp_hash = '{}_v{}'.format(self.name, version) 105 | self.created_at = str(datetime.utcnow()) 106 | self.rank = rank 107 | self.process = os.getpid() 108 | 109 | # when debugging don't do anything else 110 | if debug: 111 | return 112 | 113 | # update version hash if we need to increase version on our own 114 | # we will increase the previous version, so do it now so the hash 115 | # is accurate 116 | if version is None: 117 | old_version = self.__get_last_experiment_version() 118 | self.exp_hash = '{}_v{}'.format(self.name, old_version + 1) 119 | self.version = old_version + 1 120 | 121 | # create a new log file 122 | self.__init_cache_file_if_needed() 123 | 124 | # when we have a 
version, load it 125 | if self.version is not None: 126 | 127 | # when no version and no file, create it 128 | if not os.path.exists(self.__get_log_name()): 129 | self.__create_exp_file(self.version) 130 | else: 131 | # otherwise load it 132 | try: 133 | self.__load() 134 | except Exception as e: 135 | self.debug = True 136 | else: 137 | # if no version given, increase the version to a new exp 138 | # create the file if not exists 139 | old_version = self.__get_last_experiment_version() 140 | self.version = old_version 141 | self.__create_exp_file(self.version + 1) 142 | 143 | # create a git tag if requested 144 | if self.create_git_tag: 145 | desc = description if description is not None else 'no description' 146 | tag_msg = 'Test tube exp: {} - {}'.format(self.name, desc) 147 | cmd = 'git tag -a tt_{} -m "{}"'.format(self.exp_hash, tag_msg) 148 | os.system(cmd) 149 | print('Test tube created git tag:', 'tt_{}'.format(self.exp_hash)) 150 | 151 | # set the tensorboardx log path to the /tf folder in the exp folder 152 | log_dir = self.get_tensorboardx_path(self.name, self.version) 153 | # this is a fix for pytorch 1.1 since it does not have this attribute 154 | for attr, val in [('purge_step', None), 155 | ('max_queue', 10), 156 | ('flush_secs', 120), 157 | ('filename_suffix', '')]: 158 | if not hasattr(self, attr): 159 | setattr(self, attr, val) 160 | super().__init__(log_dir=log_dir, *args, **kwargs) 161 | 162 | # register on exit fx so we always close the writer 163 | # atexit.register(self.on_exit) 164 | 165 | def get_meta_copy(self): 166 | """ 167 | Gets a meta-version only copy of this module 168 | :return: 169 | """ 170 | return DDPExperiment(self) 171 | 172 | def on_exit(self): 173 | pass 174 | 175 | 176 | def __clean_dir(self): 177 | files = os.listdir(self.save_dir) 178 | 179 | if self.rank == 0: 180 | return 181 | 182 | for f in files: 183 | if str(self.process) in f: 184 | os.remove(os.path.join(self.save_dir, f)) 185 | 186 | def argparse(self, argparser): 187 | parsed = vars(argparser) 188 | to_add = {} 189 | 190 | # don't store methods 191 | for k, v in parsed.items(): 192 | if not callable(v): 193 | to_add[k] = v 194 | 195 | self.tag(to_add) 196 | 197 | def add_meta_from_hyperopt(self, hypo): 198 | """ 199 | Transfers meta data about all the params from the 200 | hyperoptimizer to the log 201 | :param hypo: 202 | :return: 203 | """ 204 | meta = hypo.get_current_trial_meta() 205 | for tag in meta: 206 | self.tag(tag) 207 | 208 | # -------------------------------- 209 | # FILE IO UTILS 210 | # -------------------------------- 211 | def __init_cache_file_if_needed(self): 212 | """ 213 | Inits a file that we log historical experiments 214 | :return: 215 | """ 216 | try: 217 | exp_cache_file = self.get_data_path(self.name, self.version) 218 | if not os.path.isdir(exp_cache_file): 219 | os.makedirs(exp_cache_file, exist_ok=True) 220 | except Exception as e: 221 | # file already exists (likely written by another exp. 
In this case disable the experiment 222 | self.debug = True 223 | 224 | def __create_exp_file(self, version): 225 | """ 226 | Recreates the old file with this exp and version 227 | :param version: 228 | :return: 229 | """ 230 | 231 | try: 232 | exp_cache_file = self.get_data_path(self.name, self.version) 233 | # if no exp, then make it 234 | path = '{}/meta.experiment'.format(exp_cache_file) 235 | open(path, 'w').close() 236 | self.version = version 237 | 238 | # make the directory for the experiment media assets name 239 | os.makedirs(self.get_media_path(self.name, self.version), exist_ok=True) 240 | 241 | # make the directory for tensorboardx stuff 242 | os.makedirs(self.get_tensorboardx_path(self.name, self.version), exist_ok=True) 243 | except Exception as e: 244 | # file already exists (likely written by another exp. In this case disable the experiment 245 | self.debug = True 246 | 247 | 248 | def __get_last_experiment_version(self): 249 | try: 250 | exp_cache_file = os.sep.join(self.get_data_path(self.name, self.version).split(os.sep)[:-1]) 251 | return find_last_experiment_version(exp_cache_file) 252 | except Exception as e: 253 | return -1 254 | 255 | def __get_log_name(self): 256 | exp_cache_file = self.get_data_path(self.name, self.version) 257 | return '{}/meta.experiment'.format(exp_cache_file) 258 | 259 | def tag(self, tag_dict): 260 | """ 261 | Adds a tag to the experiment. 262 | Tags are metadata for the exp. 263 | 264 | >> e.tag({"model": "Convnet A"}) 265 | 266 | :param key: 267 | :param val: 268 | :return: 269 | """ 270 | if self.debug or self.rank > 0: return 271 | 272 | # parse tags 273 | for k, v in tag_dict.items(): 274 | self.tags[k] = v 275 | 276 | # save if needed 277 | if self.autosave == True: 278 | self.save() 279 | 280 | def log(self, metrics_dict, global_step=None, walltime=None): 281 | """ 282 | Adds a json dict of metrics. 
283 | 284 | >> e.log({"loss": 23, "coeff_a": 0.2}) 285 | 286 | :param metrics_dict: 287 | :tag optional tfx tag 288 | :return: 289 | """ 290 | if self.debug or self.rank > 0: return 291 | 292 | # handle tfx metrics 293 | if global_step is None: 294 | global_step = len(self.metrics) 295 | 296 | new_metrics_dict = metrics_dict.copy() 297 | for k, v in metrics_dict.items(): 298 | if isinstance(v, dict): 299 | self.add_scalars(main_tag=k, tag_scalar_dict=v, global_step=global_step, walltime=walltime) 300 | tmp_metrics_dict = new_metrics_dict.pop(k) 301 | new_metrics_dict.update(tmp_metrics_dict) 302 | else: 303 | self.add_scalar(tag=k, scalar_value=v, global_step=global_step, walltime=walltime) 304 | 305 | metrics_dict = new_metrics_dict 306 | 307 | # timestamp 308 | if 'created_at' not in metrics_dict: 309 | metrics_dict['created_at'] = str(datetime.utcnow()) 310 | 311 | self.__convert_numpy_types(metrics_dict) 312 | 313 | self.metrics.append(metrics_dict) 314 | 315 | if self.autosave: 316 | self.save() 317 | 318 | def __convert_numpy_types(self, metrics_dict): 319 | for k, v in metrics_dict.items(): 320 | if v.__class__.__name__ == 'float32': 321 | metrics_dict[k] = float(v) 322 | 323 | if v.__class__.__name__ == 'float64': 324 | metrics_dict[k] = float(v) 325 | 326 | def save(self): 327 | """ 328 | Saves current experiment progress 329 | :return: 330 | """ 331 | if self.debug or self.rank > 0: return 332 | 333 | # save images and replace the image array with the 334 | # file name 335 | self.__save_images(self.metrics) 336 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv' 337 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv' 338 | 339 | obj = { 340 | 'name': self.name, 341 | 'version': self.version, 342 | 'tags_path': meta_tags_path, 343 | 'metrics_path': metrics_file_path, 344 | 'autosave': self.autosave, 345 | 'description': self.description, 346 | 'created_at': self.created_at, 347 | 'exp_hash': self.exp_hash 348 | } 349 | 350 | # save the experiment meta file 351 | with atomic_write(self.__get_log_name()) as tmp_path: 352 | with open(tmp_path, 'w') as file: 353 | json.dump(obj, file, ensure_ascii=False) 354 | 355 | # save the metatags file 356 | df = pd.DataFrame({'key': list(self.tags.keys()), 'value': list(self.tags.values())}) 357 | with atomic_write(meta_tags_path) as tmp_path: 358 | df.to_csv(tmp_path, index=False) 359 | 360 | # save the metrics data 361 | df = pd.DataFrame(self.metrics) 362 | with atomic_write(metrics_file_path) as tmp_path: 363 | df.to_csv(tmp_path, index=False) 364 | 365 | # write new vals to disk 366 | self.flush() 367 | 368 | # until hparam plugin is fixed, generate hparams as text 369 | if not self.tag_markdown_saved and len(self.tags) > 0: 370 | self.tag_markdown_saved = True 371 | self.add_text('hparams', self.__generate_tfx_meta_log()) 372 | 373 | def __generate_tfx_meta_log(self): 374 | header = f'''###### {self.name}, version {self.version}\n---\n''' 375 | desc = '' 376 | if self.description is not None: 377 | desc = f'''#####*{self.description}*\n''' 378 | params = f'''##### Hyperparameters\n''' 379 | 380 | row_header = '''parameter|value\n-|-\n''' 381 | rows = [row_header] 382 | for k, v in self.tags.items(): 383 | row = f'''{k}|{v}\n''' 384 | rows.append(row) 385 | 386 | all_rows = [ 387 | header, 388 | desc, 389 | params 390 | ] 391 | all_rows.extend(rows) 392 | mkdown_log = ''.join(all_rows) 393 | return mkdown_log 394 | 395 | def __save_images(self, metrics): 396 | """ 397 | Save tags 
that have a png_ prefix (as images) 398 | and replace the meta tag with the file name 399 | :param metrics: 400 | :return: 401 | """ 402 | # iterate all metrics and find keys with a specific prefix 403 | for i, metric in enumerate(metrics): 404 | for k, v in metric.items(): 405 | # if the prefix is a png, save the image and replace the value with the path 406 | img_extension = None 407 | img_extension = 'png' if 'png_' in k else img_extension 408 | img_extension = 'jpg' if 'jpg' in k else img_extension 409 | img_extension = 'jpeg' if 'jpeg' in k else img_extension 410 | 411 | if img_extension is not None: 412 | # determine the file name 413 | img_name = '_'.join(k.split('_')[1:]) 414 | save_path = self.get_media_path(self.name, self.version) 415 | save_path = '{}/{}_{}.{}'.format(save_path, img_name, i, img_extension) 416 | 417 | # save image to disk 418 | if type(metric[k]) is not str: 419 | imwrite(save_path, metric[k]) 420 | 421 | # replace the image in the metric with the file path 422 | metric[k] = save_path 423 | 424 | def __load(self): 425 | # load .experiment file 426 | with open(self.__get_log_name(), 'r') as file: 427 | data = json.load(file) 428 | self.name = data['name'] 429 | self.version = data['version'] 430 | self.autosave = data['autosave'] 431 | self.created_at = data['created_at'] 432 | self.description = data['description'] 433 | self.exp_hash = data['exp_hash'] 434 | 435 | # load .tags file 436 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv' 437 | df = pd.read_csv(meta_tags_path) 438 | self.tags_list = df.to_dict(orient='records') 439 | self.tags = {} 440 | for d in self.tags_list: 441 | k, v = d['key'], d['value'] 442 | self.tags[k] = v 443 | 444 | # load metrics 445 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv' 446 | try: 447 | df = pd.read_csv(metrics_file_path) 448 | self.metrics = df.to_dict(orient='records') 449 | 450 | # remove nans 451 | for metric in self.metrics: 452 | to_delete = [] 453 | for k, v in metric.items(): 454 | try: 455 | if np.isnan(v): 456 | to_delete.append(k) 457 | except Exception as e: 458 | pass 459 | 460 | for k in to_delete: 461 | del metric[k] 462 | except Exception as e: 463 | # metrics was empty... 
464 | self.metrics = [] 465 | 466 | def get_data_path(self, exp_name, exp_version): 467 | """ 468 | Returns the path to the local package cache 469 | :param path: 470 | :return: 471 | """ 472 | if self.no_save_dir: 473 | return os.path.join(_ROOT, 'test_tube_data', exp_name, 'version_{}'.format(exp_version)) 474 | else: 475 | return os.path.join(_ROOT, exp_name, 'version_{}'.format(exp_version)) 476 | 477 | def get_media_path(self, exp_name, exp_version): 478 | """ 479 | Returns the path to the local package cache 480 | :param path: 481 | :return: 482 | """ 483 | return os.path.join(self.get_data_path(exp_name, exp_version), 'media') 484 | 485 | def get_tensorboardx_path(self, exp_name, exp_version): 486 | """ 487 | Returns the path to the local package cache 488 | :param path: 489 | :return: 490 | """ 491 | return os.path.join(self.get_data_path(exp_name, exp_version), 'tf') 492 | 493 | def get_tensorboardx_scalars_path(self, exp_name, exp_version): 494 | """ 495 | Returns the path to the local package cache 496 | :param path: 497 | :return: 498 | """ 499 | tfx_path = self.get_tensorboardx_path(exp_name, exp_version) 500 | return os.path.join(tfx_path, 'scalars.json') 501 | 502 | 503 | # ---------------------------- 504 | # OVERWRITES 505 | # ---------------------------- 506 | def _get_file_writer(self): 507 | """Returns the default FileWriter instance. Recreates it if closed.""" 508 | if self.rank > 0: 509 | return TTDummyFileWriter() 510 | 511 | if self.all_writers is None or self.file_writer is None: 512 | if self.purge_step is not None: 513 | most_recent_step = self.purge_step 514 | self.file_writer = FileWriter(self.log_dir, self.max_queue, 515 | self.flush_secs, self.filename_suffix) 516 | self.file_writer.debug = self.debug 517 | self.file_writer.rank = self.rank 518 | 519 | self.file_writer.add_event( 520 | Event(step=most_recent_step, file_version='brain.Event:2')) 521 | self.file_writer.add_event( 522 | Event(step=most_recent_step, session_log=SessionLog(status=SessionLog.START))) 523 | else: 524 | self.file_writer = FileWriter(self.log_dir, self.max_queue, 525 | self.flush_secs, self.filename_suffix) 526 | self.all_writers = {self.file_writer.get_logdir(): self.file_writer} 527 | return self.file_writer 528 | 529 | 530 | def __str__(self): 531 | return 'Exp: {}, v: {}'.format(self.name, self.version) 532 | 533 | def __hash__(self): 534 | return 'Exp: {}, v: {}'.format(self.name, self.version) 535 | 536 | def flush(self): 537 | if self.rank > 0: 538 | return 539 | 540 | if self.all_writers is None: 541 | return # ignore double close 542 | 543 | for writer in self.all_writers.values(): 544 | writer.flush() 545 | 546 | 547 | class TTDummyFileWriter(object): 548 | 549 | def add_summary(self, summary, global_step=None, walltime=None): 550 | """ 551 | Overwrite tf add summary so we can ignore when other non-zero processes call it 552 | Avoids overwriting logs from multiple processes 553 | :param summary: 554 | :param global_step: 555 | :param walltime: 556 | :return: 557 | """ 558 | return 559 | 560 | 561 | @contextlib.contextmanager 562 | def atomic_write(dst_path): 563 | """A context manager to simplify atomic writing. 564 | 565 | Usage: 566 | >>> with atomic_write(dst_path) as tmp_path: 567 | >>> # write to tmp_path 568 | >>> # Here tmp_path renamed to dst_path, if no exception happened. 
569 | """ 570 | tmp_path = str(dst_path) + '.tmp' 571 | try: 572 | yield tmp_path 573 | except: 574 | if os.path.exists(tmp_path): 575 | os.remove(tmp_path) 576 | raise 577 | else: 578 | # If everything is fine, move tmp file to the destination. 579 | shutil.move(tmp_path, str(dst_path)) 580 | 581 | 582 | def find_last_experiment_version(path): 583 | last_version = -1 584 | for f in os.listdir(path): 585 | if 'version_' in f: 586 | file_parts = f.split('_') 587 | version = int(file_parts[-1]) 588 | last_version = max(last_version, version) 589 | return last_version 590 | 591 | 592 | if __name__ == '__main__': 593 | from time import sleep 594 | e = Experiment(description='my description') 595 | e.tag({'lr': 0.02, 'layers': 4}) 596 | 597 | for n_iter in range(20): 598 | sleep(0.3) 599 | e.log({'loss/xsinx': n_iter * np.sin(n_iter)}) 600 | if n_iter % 10 == 0: 601 | print('saved') 602 | e.save() 603 | 604 | e.close() 605 | os._exit(1) 606 | 607 | --------------------------------------------------------------------------------