├── aimlutils ├── __init__.py ├── data │ ├── __init__.py │ └── splitting.py ├── echo │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ ├── pruners.py │ │ ├── trial_suggest.py │ │ ├── samplers.py │ │ └── base_objective.py │ ├── examples │ │ ├── keras │ │ │ ├── launch.sh │ │ │ ├── model_config.yml │ │ │ ├── hyperparameter.yml │ │ │ ├── objective.py │ │ │ ├── data_generator.py │ │ │ └── model.py │ │ └── torch │ │ │ ├── model.yml │ │ │ ├── hyperparameter.yml │ │ │ └── objective.py │ ├── report.py │ ├── run.py │ ├── README.ipynb │ ├── README.md │ └── optimize.py ├── torch │ ├── __init__.py │ ├── losses │ │ ├── __init__.py │ │ └── losses.py │ ├── models │ │ └── __init__.py │ ├── trainers │ │ ├── __init__.py │ │ └── trainers.py │ ├── checkpoint │ │ ├── __init__.py │ │ └── checkpointer.py │ └── optimizers │ │ ├── __init__.py │ │ └── optimizers.py └── utils │ ├── __init__.py │ ├── tqdm.py │ └── gpu.py ├── __version__.py ├── blog ├── site │ ├── generators │ │ └── test_data.pkl │ ├── memory_images │ │ ├── complex.png │ │ ├── test.py.png │ │ ├── complex_plot.png │ │ ├── example.py.png │ │ ├── example.py_m.png │ │ ├── mprofile.dat.png │ │ ├── mprof_run_plot.png │ │ ├── test.py_output.png │ │ ├── example.py_output.png │ │ └── example.py_m_output.png │ ├── _toc.yml │ ├── home.md │ ├── howto.md │ ├── _config.yml │ ├── memory.ipynb │ ├── optuna_mariadb.ipynb │ ├── generators.ipynb │ └── slurm.ipynb ├── build.sh ├── publish.sh └── NCAR_UCAR_LIVERY │ └── Logos │ └── Contemporary Logos │ └── NCAR │ └── Dark Logo │ └── NCAR-contemp-logo-blue.png ├── .gitignore ├── requirements.txt ├── README.md └── setup.py /aimlutils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/echo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/echo/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' -------------------------------------------------------------------------------- /aimlutils/torch/losses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /aimlutils/torch/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from aimlutils.torch.checkpoint.checkpointer import * 2 | -------------------------------------------------------------------------------- /aimlutils/torch/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from aimlutils.torch.optimizers.optimizers import * 2 | -------------------------------------------------------------------------------- /blog/site/generators/test_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/generators/test_data.pkl -------------------------------------------------------------------------------- /blog/site/memory_images/complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/complex.png -------------------------------------------------------------------------------- /blog/site/memory_images/test.py.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/test.py.png -------------------------------------------------------------------------------- /blog/site/memory_images/complex_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/complex_plot.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py_m.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_m.png -------------------------------------------------------------------------------- /blog/site/memory_images/mprofile.dat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/mprofile.dat.png -------------------------------------------------------------------------------- /blog/site/memory_images/mprof_run_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/mprof_run_plot.png -------------------------------------------------------------------------------- /blog/site/memory_images/test.py_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/test.py_output.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_output.png 
--------------------------------------------------------------------------------
/blog/site/memory_images/example.py_m_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_m_output.png
--------------------------------------------------------------------------------
/blog/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Clean out the current build
4 | jupyter-book clean site/_build
5 | # Build the site with jupyter-book
6 | jupyter-book build site
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | blog/site/_build/*
2 | .ipynb_checkpoints/*
3 | */.ipynb_checkpoints/*
4 | */*/.ipynb_checkpoints/*
5 | */*/*/.ipynb_checkpoints/*
6 | blog/NCAR_UCAR_LIVERY/*
7 | blog/.DS_Store
--------------------------------------------------------------------------------
/blog/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Publish the site on GitHub
4 | ghp-import -n -p -f site/_build/html
5 | 
6 | # Print the site domain address for convenience
7 | echo "https://ncar.github.io/aiml-utils/home.html"
8 | 
--------------------------------------------------------------------------------
/blog/site/_toc.yml:
--------------------------------------------------------------------------------
1 | - file: home
2 | - file: howto
3 | - file: optuna_mariadb.ipynb
4 | - file: slurm.ipynb
5 | - file: memory.ipynb
6 | - file: callbacks.ipynb
7 | - file: data_loaders.ipynb
8 | - file: generators.ipynb
--------------------------------------------------------------------------------
/blog/NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | optuna
4 | matplotlib
5 | tensorflow
6 | torch
7 | torchvision
8 | pyyaml
9 | scipy
10 | xarray
11 | netcdf4
12 | jupyter
13 | jupyter-book
14 | ghp-import
15 | sphinxcontrib-bibtex<2.0.0
--------------------------------------------------------------------------------
/aimlutils/utils/tqdm.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm as tqdm_base
2 | 
3 | def tqdm(*args, **kwargs):
4 |     if hasattr(tqdm_base, '_instances'):
5 |         for instance in list(tqdm_base._instances):
6 |             tqdm_base._decr_instances(instance)
7 |     return tqdm_base(*args, **kwargs)
8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aiml-utils
2 | This repository contains:
3 | 
4 | - The AIML group's jupyter-book blog website (blog/). [See also here](https://ncar.github.io/aiml-utils/home.html).
5 | 
6 | - **E**arth **C**omputing **H**yperparameter **O**ptimization, ECHO (aimlutils/echo)
7 | 
8 | - Utilities that are shared across different projects (aimlutils/[utils,data,torch])
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | #SBATCH --account=NAML0001
3 | #SBATCH --gres=gpu:v100:1
4 | #SBATCH --mem=128G
5 | #SBATCH -n 8
6 | #SBATCH -t 12:00:00
7 | #SBATCH -J hyper_opt
8 | #SBATCH -o hyper_opt.out
9 | #SBATCH -e hyper_opt.err
10 | module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1
11 | ncar_pylib
12 | python run.py examples/keras/hyperparameter.yml examples/keras/model_config.yml
--------------------------------------------------------------------------------
/aimlutils/utils/gpu.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | def gpu_report():
4 |     """Get the current free GPU memory per device.
5 | 
6 |     Returns
7 |     -------
8 |     usage: dict
9 |         Keys are device ids as integers.
10 |         Values are free memory as integers in MB (via nvidia-smi --query-gpu=memory.free).
11 |     """
12 |     cmd = ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader']
13 |     result = subprocess.check_output(cmd)
14 |     result = result.decode('utf-8')
15 |     # Convert lines into a dictionary
16 |     gpu_memory = [int(x) for x in result.strip().split('\n')]
17 |     gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
18 |     return gpu_memory_map
19 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/pruners.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import sys
5 | import optuna
6 | import logging
7 | from tensorflow.keras.callbacks import Callback
8 | 
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class KerasPruningCallback(Callback):
14 | 
15 |     def __init__(self, trial, monitor, interval = 1):
16 |         # type: (optuna.trial.Trial, str, int) -> None
17 | 
18 |         super(KerasPruningCallback, self).__init__()
19 | 
20 |         self.trial = trial
21 |         self.monitor = monitor
22 |         self.interval = interval
23 | 
24 |     def on_epoch_end(self, epoch, logs=None):
25 |         # type: (int, Dict[str, float]) -> None
26 | 
27 |         # Only report to optuna (and consider pruning) every `interval` epochs
28 |         if (epoch + 1) % self.interval != 0:
29 |             return
30 |         logs = logs or {}
31 |         current_score = logs.get(self.monitor)
32 |         if current_score is None:
33 |             return
34 |         self.trial.report(current_score, step=epoch)
35 |         if self.trial.should_prune():
36 |             message = "Trial was pruned at epoch {}.".format(epoch)
37 |             raise optuna.TrialPruned(message)
38 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/model_config.yml:
--------------------------------------------------------------------------------
1 | path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
2 | path_save: "examples/keras/results"
3 | model_name: "cnn"
4 | num_particles: 3
5 | random_seed: 328942
6 | output_cols: ["x", "y", "z", "d"]
7 | scaler_out: "StandardScaler"
8 | num_z_bins: 100
9 | subset: False
10 | conv2d_network:
11 |   filters: [8, 12, 16]
12 |   kernel_sizes: [5, 5, 5]
13 |   conv2d_activation: "leakyrelu"
14 |   pool_sizes: [5, 5, 5]
15 |   pool_dropout: 0.5
16 |   dense_sizes: [100, 50]
17 |   dense_dropout: 0.5
18 |   dense_activation: "leakyrelu"
19 |   output_activation: "linear"
20 |   lr: 0.0001
21 |   optimizer: "adam"
22 |   loss: "mse"
23 |   metrics: ["mse", "mae"]
24
| batch_size: 128 25 | epochs: 100 26 | verbose: 1 27 | callbacks: 28 | ModelCheckpoint: 29 | monitor: "val_loss" 30 | filepath: "examples/keras/results/model.h5" 31 | save_best_only: True 32 | save_weights_only: True 33 | EarlyStopping: 34 | monitor: "val_loss" 35 | patience: 4 36 | ReduceLROnPlateau: 37 | monitor: "val_loss" 38 | factor: 0.2 39 | patience: 1 40 | min_lr: 0.0000001 41 | mode: "auto" 42 | CSVLogger: 43 | filename: "examples/keras/results/training.txt" 44 | separator: " " 45 | append: True 46 | -------------------------------------------------------------------------------- /aimlutils/echo/src/trial_suggest.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import sys 5 | import optuna 6 | import logging 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | supported_trials = [ 13 | "categorical", 14 | "discrete_uniform", 15 | "float", 16 | "int", 17 | "loguniform", 18 | "uniform" 19 | ] 20 | 21 | 22 | def trial_suggest_loader(trial, config): 23 | 24 | try: 25 | _type = config["type"] 26 | if _type == "categorical": 27 | return trial.suggest_categorical(**config["settings"]) 28 | elif _type == "discrete_uniform": 29 | return int(trial.suggest_discrete_uniform(**config["settings"])) 30 | elif _type == "float": 31 | return float(trial.suggest_float(**config["settings"])) 32 | elif _type == "int": 33 | return int(trial.suggest_int(**config["settings"])) 34 | elif _type == "loguniform": 35 | return float(trial.suggest_loguniform(**config["settings"])) 36 | elif _type == "uniform": 37 | return float(trial.suggest_uniform(**config["settings"])) 38 | else: #if _type not in supported_trials: 39 | message = f"Type {_type} is not valid. 
Select from {supported_trials}"
40 |             logger.warning(message)
41 |             raise OSError(message)
42 |     except Exception as E:
43 |         logger.warning(f"Trial suggestion failed with error '{E}' for config {config}")
44 |         raise OSError(f"Trial suggestion failed for config {config}") from E
--------------------------------------------------------------------------------
/aimlutils/echo/examples/torch/model.yml:
--------------------------------------------------------------------------------
1 | log: 'examples/torch/test'
2 | type: "encoder-vae"
3 | 
4 | data:
5 |   path_data: "/glade/scratch/schreck/holodec/"
6 |   num_particles: "50-100"
7 |   maxnum_particles: 100
8 |   output_cols: ["x", "y", "z", "d", "binary"]
9 |   subset: False
10 | 
11 | transforms:
12 |   #RandomVerticalFlip: False
13 |   #RandomHorizontalFlip: False
14 |   Rescale: 600
15 |   Normalize: 'norm'
16 |   ToTensor: True
17 | 
18 | iterator:
19 |   num_workers: 8
20 |   batch_size: 32
21 |   pin_memory: True
22 |   shuffle: True
23 | 
24 | model:
25 |   image_channels: 1
26 |   hidden_dims: [3, 94, 141, 471, 425, 1122]
27 |   z_dim: 1277
28 |   dense_hidden_dims: [1000]
29 |   dense_dropouts: [0.0]
30 |   tasks: ["x", "y", "z", "d", "binary"]
31 |   pretrained_model: "/glade/work/schreck/repos/holodec-ml/scripts/schreck/compressor/pretrained/pretrained.pt"
32 | 
33 | optimizer:
34 |   type: "lookahead-diffgrad"
35 |   lr: 0.000631
36 |   weight_decay: 0.0
37 | 
38 | callbacks:
39 |   MetricsLogger:
40 |     path_save: "test"
41 |     reload: False
42 |   EarlyStopping:
43 |     patience: 5
44 |     verbose: True
45 |     path_save: "examples/torch/test/checkpoint.pt"
46 |   ExponentialLR:
47 |     gamma: 0.95
48 | 
49 | #  ReduceLROnPlateau:
50 | #    mode: "min"
51 | #    factor: 0.2
52 | #    patience: 1
53 | #    min_lr: 0.0000000001
54 | #    verbose: True
55 | 
56 | trainer:
57 |   start_epoch: 0
58 |   epochs: 1
59 |   clip: 1.0
60 |   alpha: 1.0
61 |   beta: 0.1
62 |   path_save: "examples/torch/test"
63 |   test_image: "examples/torch/test/image_600.pkl"
--------------------------------------------------------------------------------
/aimlutils/data/splitting.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split as _train_test_split
2 | from typing import Tuple
3 | import pandas as pd
4 | import numpy as np
5 | 
6 | # To do: Add a logger and verbose options
7 | 
8 | def stratified_split(df: pd.DataFrame,
9 |                      frac: float,
10 |                      column: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
11 |     """Stratified split on `column`; rows whose label occurs only once are kept in train."""
12 |     label_count = df[column].value_counts().to_dict()
13 |     labels_we_can_use = df[column].apply(lambda x: label_count[x] > 1)
14 |     items_with_count_one = df[~labels_we_can_use].copy()
15 |     items_needing_split = df[labels_we_can_use].copy()
16 | 
17 |     train, test = _train_test_split(
18 |         items_needing_split,
19 |         test_size=frac,
20 |         stratify=items_needing_split[column]
21 |     )
22 |     train = pd.concat([train, items_with_count_one], axis = 0, sort = True)#.reset_index(drop = True)
23 |     return train, test
24 | 
25 | 
26 | def train_test_split(df: pd.DataFrame,
27 |                      fraction: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]:
28 |     """Stratified train/test split on the "label" column."""
29 |     fraction = min(1.0, fraction)
30 |     train, test = stratified_split(df, fraction, "label")
31 |     return train, test
32 | 
33 | 
34 | def train_test_val_split(df: pd.DataFrame,
35 |                          fraction: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
36 |     """Stratified train/test/validation split on the "label" column."""
37 |     fraction = min(1.0, fraction)
38 |     train, _test = stratified_split(df, fraction, "label")
39 |     test, val = stratified_split(_test, 0.5, "label")
40 | 
41 |     return train, test, val
--------------------------------------------------------------------------------
/blog/site/home.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | Welcome to the AIML group's blog page!
4 | 
5 | The goal of this site is to help current and future members of the AIML group at NCAR share programming lessons and tips.
6 | 
7 | # About Us
8 | The NCAR Analytics and Integrative Machine Learning group develops machine learning systems to improve our understanding and prediction of the many facets of Earth's systems. Our group works in close collaboration with domain scientists across NCAR's labs, the university community, and the private sector to develop and integrate machine learning tools into modeling and observation pipelines. Our group members are:
9 | 
10 | * David John Gagne, Machine Learning Scientist II (CISL/RAL)
11 | * John Schreck, Machine Learning Scientist I (CISL)
12 | * Charlie Becker, Associate Scientist II (CISL)
13 | * Gabrielle Gantos, Associate Scientist II (CISL)
14 | * Maria Molina, Project Scientist I (CGD)
15 | * Zhonghua Zheng, Postdoctoral Fellow (CISL/CGD)
16 | * Will Chapman, Postdoctoral Fellow (CGD/CISL/RAL)
17 | * Mariana Cains, Postdoctoral Fellow (MMM)
18 | * Chris Wirz, Postdoctoral Fellow (MMM)
19 | * Keely Lawrence, Student Assistant III (CISL, University of Colorado)
20 | * Prahalath Bharathi, Student Assistant (CISL, Tufts University)
21 | 
22 | # Sources
23 | Each blog published here is a Jupyter notebook, and you can download them from the [aiml-utils](https://github.com/NCAR/aiml-utils/tree/master/blog/site) repository hosted on the [NCAR GitHub group](https://github.com/NCAR) page.
24 | 
25 | Please head over to the [How To](https://ncar.github.io/aiml-utils/howto.html) page for instructions on how to add a blog to this website.
26 | 
27 | # License
28 | This work is licensed under a GNU General Public License v3.0
29 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/samplers.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import sys
5 | import optuna
6 | import logging
7 | 
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | supported_samplers = [
13 |     "TPESampler",
14 |     "GridSampler",
15 |     "RandomSampler",
16 |     "CmaEsSampler",
17 |     "IntersectionSearchSpace",
18 |     "MOTPEMultiObjectiveSampler",
19 |     "NSGAIIMultiObjectiveSampler",
20 |     "RandomMultiObjectiveSampler"
21 | ]
22 | 
23 | 
24 | def samplers(sampler):
25 |     _type = sampler.pop("type")
26 |     if _type not in supported_samplers:
27 |         message = f"Sampler {_type} is not valid. Select from {supported_samplers}"
28 |         logger.warning(message)
29 |         raise OSError(message)
30 |     if _type == "TPESampler":
31 |         return optuna.samplers.TPESampler(**sampler)
32 |     elif _type == "GridSampler":
33 |         if "search_space" not in sampler:
34 |             raise OSError("You must provide search_space options with the GridSampler.")
35 |         else:
36 |             return optuna.samplers.GridSampler(**sampler)
37 |     elif _type == "RandomSampler":
38 |         return optuna.samplers.RandomSampler(**sampler)
39 |     elif _type == "CmaEsSampler":
40 |         return optuna.samplers.CmaEsSampler(**sampler)
41 |     elif _type == "IntersectionSearchSpace":
42 |         return optuna.samplers.IntersectionSearchSpace(**sampler)
43 |     # support for multi-objective studies
44 |     elif _type == "MOTPEMultiObjectiveSampler":
45 |         return optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler(**sampler)
46 |     elif _type == "NSGAIIMultiObjectiveSampler":
47 |         return optuna.multi_objective.samplers.NSGAIIMultiObjectiveSampler(**sampler)
48 |     elif _type == "RandomMultiObjectiveSampler":
49 |         return optuna.multi_objective.samplers.RandomMultiObjectiveSampler(**sampler)
--------------------------------------------------------------------------------
/blog/site/howto.md:
--------------------------------------------------------------------------------
1 | # Add a jupyter notebook as a blog
2 | 
3 | 0. Install [aiml-utils](https://github.com/NCAR/aiml-utils/tree/master/aimlutils), or if already installed do a `git pull` with master before proceeding. Then switch to your branch (or create a new branch).
4 | 
5 | 1. Write your notebook, which we will assume is called `test_blog.ipynb`, with comments in Markdown. Be sure to check the header number of the most recently published blog, and set yours to be one greater (for example, the very first line in the callbacks blog has the title: 3. Callbacks: Utilities for interacting with ML training). If your blog comes next, make sure the first line is (in Markdown): `# 4. Your blog's title`.
6 | 
7 | 2. Save your notebook in the directory `blog/site`. If any additional data or files accompany your blog, save those details in a separate file in `blog/site`.
8 | 
9 | 3. Add your blog as the last entry to the registry `blog/site/_toc.yml`. For example: `- file: test_blog.ipynb`
10 | 
11 | 4. Rebuild the blog website. First change directory to "aiml-utils/blog", then type: `jupyter-book build site` (you must have jupyter-book, as well as ghp-import, installed via pip; the full command sequence is consolidated below). Once the site is rebuilt, it will supply you with a local address to view the latest (local) build of the website. When you are happy with your entry, commit the blog to your branch of aiml-utils and issue a pull request.
12 | 
13 | 5. When the pull request is approved and merged, publish the blog by first changing to the aiml-utils/blog directory. Next, execute the following command (which will ask for your github username and password): `ghp-import -n -p -f site/_build/html`
14 | 
15 | 6. Check the [updated website](https://ncar.github.io/aiml-utils/home.html) for any mistakes or errors.
16 | 
17 | Please direct any questions or comments to John Schreck (schreck@ucar.edu) or David John Gagne (dgagne@ucar.edu).
18 | 
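19 | For convenience, steps 4 and 5 boil down to the command sequence below, mirroring `blog/build.sh` and `blog/publish.sh` in this repository (run from the `aiml-utils/blog` directory):
20 | 
21 | ```bash
22 | # Clean out any previous build, then rebuild the site with jupyter-book
23 | jupyter-book clean site/_build
24 | jupyter-book build site
25 | # Publish the freshly built HTML to GitHub Pages
26 | ghp-import -n -p -f site/_build/html
27 | ```
28 | 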
29 | # License
30 | This work is licensed under a GNU General Public License v3.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | '''
2 | Setup file for the necessary packages to build and install aimlutils.
3 | For ease, this can be installed from GitHub via:
4 | pip3 install git+https://github.com/NCAR/aiml-utils.git
5 | 
6 | It is recommended that you install this into a Python or Conda Virtual Environment.
7 | '''
8 | 
9 | import codecs
10 | import os
11 | import re
12 | from setuptools import setup, find_packages
13 | 
14 | here = os.path.abspath(os.path.dirname(__file__))
15 | 
16 | def read(*parts):
17 |     with codecs.open(os.path.join(here, *parts), 'r') as fp:
18 |         return fp.read()
19 | 
20 | def find_version(*file_paths):
21 |     version_file = read(*file_paths)
22 |     version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
23 |                               version_file, re.M)
24 |     if version_match:
25 |         return version_match.group(1)
26 |     raise RuntimeError("Unable to find version string.")
27 | 
28 | 
29 | 
30 | with open("README.md") as f:
31 |     long_description = f.read()
32 | 
33 | 
34 | with open("requirements.txt") as f:
35 |     required_libraries = f.read().splitlines()
36 | 
37 | 
38 | setup(
39 |     name='aimlutils',
40 |     version=find_version("./", "__version__.py"),
41 |     author='AIML',
42 |     description=('This repository contains various pieces of code that are shared across different AIML projects, as well as notebooks for blogs'),
43 |     long_description=long_description,
44 |     long_description_content_type='text/markdown',
45 |     url='https://github.com/NCAR/aiml-utils',
46 |     classifiers=[
47 |         "Intended Audience :: Developers",
48 |         "Natural Language :: English",
49 |         "Programming Language :: Python :: 3.7",
50 |     ],
51 |     keywords="",
52 |     install_requires=required_libraries,
53 |     packages=find_packages(exclude=['aimlutils/tests']),
54 |     # test_suite='tests',
55 |     zip_safe=False,
56 | )
57 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/torch/hyperparameter.yml:
--------------------------------------------------------------------------------
1 | log:
2 |   save_path: "examples/torch/test/log.txt"
3 | 
4 | slurm:
5 |   jobs: 1
6 |   batch:
7 |     account: "NAML0001"
8 |     gres: "gpu:v100:1"
9 |     mem: "128G"
10 |     n: 8
11 |     t: "12:00:00"
12 |     J: "hyper_opt"
13 |     o: "hyper_opt.out"
14 |     e: "hyper_opt.err"
15 | 
16 | optuna:
17 |   name: "holodec_optimization.db"
18 |   reload: 0
19 |   objective: "/glade/work/schreck/repos/aiml-utils/aimlutils/hyper_opt/examples/torch/objective.py"
20 |   direction: "minimize"
21 |   metric: "val_loss"
22 |   n_trials: 1
23 |   gpu: True
24 |   save_path: 'examples/torch/test'
25 |   sampler:
26 |     type: "TPESampler"
27 |   parameters:
28 |     num_dense:
29 |       type: "int"
30 |       settings:
31 |         name: "num_dense"
32 |         low: 0
33 |         high: 10
34 |     dense_hidden_dim1:
35 |       type: "int"
36 |       settings:
37 |         name: "dense_hidden_dim1"
38 |         low: 10
39 |         high: 10000
40 |     dense_hidden_dim2:
41 |       type: "int"
42 |       settings:
43 |         name: "dense_hidden_dim2"
44 |         low: 10
45 |         high: 5000
46 |     dr1:
47 |       type: "float"
48 |       settings:
49 |         name: "dr1"
50 |         low: 0.0
51 |         high: 0.5
52 |     dr2:
53 |       type: "float"
54 |       settings:
55 |         name: "dr2"
56 |         low: 0.0
57 |         high: 0.5
58 |     trainer:alpha:
59 |       type: "float"
60 |       settings:
61 |         name: "alpha"
62 |         low: 0.001
63 |         high: 1.0
64 |     trainer:beta:
65 |       type: "float"
66 |       settings:
67 |         name: "beta"
68 |         low: 0.001
69 |         high: 1.0
70 |     optimizer:lr:
71 |       type: "loguniform"
72 |       settings:
73 |         name: "lr"
74 |         low: 0.0000001
75 |         high: 0.01
76 |     optimizer:weight_decay:
77 |       type: "loguniform"
78 |       settings:
79 |         name: "weight_decay"
80 |         low: 0.00000001
81 |         high: 0.1
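82 | 
83 | # Note on parameter names (see aimlutils/echo/src/base_objective.py): a key
84 | # containing ":" (e.g. "optimizer:lr") is split on the colon and used to update
85 | # the matching *nested* fields of the model configuration via recursive_update,
86 | # while a plain key (e.g. "num_dense") must match a top-level config field.
87 | # The "name" under settings is only the label optuna records for the trial.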
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/hyperparameter.yml:
--------------------------------------------------------------------------------
1 | log:
2 |   save_path: "examples/keras/results/log.txt"
3 | 
4 | slurm:
5 |   jobs: 1
6 |   batch:
7 |     account: "NAML0001"
8 |     gres: "gpu:v100:1"
9 |     mem: "128G"
10 |     n: 8
11 |     t: "12:00:00"
12 |     J: "hyper_opt"
13 |     o: "hyper_opt.out"
14 |     e: "hyper_opt.err"
15 | 
16 | optuna:
17 |   name: "holodec_optimization.db"
18 |   reload: 1
19 |   objective: "examples/keras/objective.py"
20 |   direction: "minimize"
21 |   metric: "val_loss"
22 |   n_trials: 500
23 |   gpu: True
24 |   save_path: 'examples/keras/results'
25 |   sampler:
26 |     type: "TPESampler"
27 |   parameters:
28 |     conv2d_network:lr:
29 |       type: "loguniform"
30 |       settings:
31 |         name: "lr"
32 |         low: 0.0000001
33 |         high: 0.01
34 |     filter1:
35 |       type: "int"
36 |       settings:
37 |         name: "filter1"
38 |         low: 1
39 |         high: 64
40 |     filter2:
41 |       type: "int"
42 |       settings:
43 |         name: "filter2"
44 |         low: 1
45 |         high: 64
46 |     filter3:
47 |       type: "int"
48 |       settings:
49 |         name: "filter3"
50 |         low: 1
51 |         high: 64
52 |     kernel1:
53 |       type: "int"
54 |       settings:
55 |         name: "kernel1"
56 |         low: 1
57 |         high: 10
58 |     kernel2:
59 |       type: "int"
60 |       settings:
61 |         name: "kernel2"
62 |         low: 1
63 |         high: 10
64 |     kernel3:
65 |       type: "int"
66 |       settings:
67 |         name: "kernel3"
68 |         low: 1
69 |         high: 10
70 |     pool1:
71 |       type: "int"
72 |       settings:
73 |         name: "pool1"
74 |         low: 1
75 |         high: 50
76 |     pool2:
77 |       type: "int"
78 |       settings:
79 |         name: "pool2"
80 |         low: 1
81 |         high: 50
82 |     pool3:
83 |       type: "int"
84 |       settings:
85 |         name: "pool3"
86 |         low: 1
87 |         high: 50
88 |     dense1:
89 |       type: "int"
90 |       settings:
91 |         name: "dense1"
92 |         low: 10
93 |         high: 10000
94 |     dense2:
95 |       type: "int"
96 |       settings:
97 |         name: "dense2"
98 |         low: 10
99 |         high: 5000
100 | 
--------------------------------------------------------------------------------
/aimlutils/torch/losses/losses.py:
--------------------------------------------------------------------------------
1 | import torch, logging
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | 
5 | 
6 | logger = logging.getLogger(__name__)
7 | 
8 | ###################
9 | #
10 | # Entropy Losses
11 | #
12 | ###################
13 | 
14 | def loss_fn(recon_x, x, mu, logvar):
15 |     criterion = nn.BCELoss(reduction='sum')
16 |     BCE = criterion(recon_x, x)
17 |     KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
18 |     return BCE + KLD, BCE, KLD
19 | 
20 | 
21 | class SymmetricCE:
22 | 
23 |     def __init__(self, alpha, gamma, kld_weight = 1.0):
24 |         self.alpha = alpha
25 |         self.gamma = gamma
26 |         self.kld_weight = kld_weight
27 | 
28 |         logger.info(f"Loaded Symmetric Cross Entropy loss ...")
29 |         logger.info(f"... with alpha = {alpha}, gamma = {gamma}, and kld_weight = {kld_weight}")
30 | 
31 |     def __call__(self, recon_x, x, mu, logvar):
32 |         criterion = nn.BCELoss(reduction='sum')
33 |         BCE = criterion(recon_x, x)
34 |         #KLD = torch.mean(-0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp(), dim = 1), dim = 0)
35 |         KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
36 |         return self.alpha * BCE + self.kld_weight * self.gamma * KLD, BCE, KLD
37 | 
38 | class SymmetricMSE:
39 | 
40 |     def __init__(self, alpha, gamma, kld_weight = 1.0):
41 |         self.alpha = alpha
42 |         self.gamma = gamma
43 |         self.kld_weight = kld_weight
44 | 
45 |         logger.info(f"Loaded Symmetric MSE loss ...")
46 |         logger.info(f"... 
with alpha = {alpha}, gamma = {gamma}, and kld_weight = {kld_weight}") 47 | 48 | def __call__(self, recon_x, x, mu, logvar): 49 | criterion = nn.MSELoss(reduction='sum') 50 | BCE = criterion(recon_x, x) 51 | #KLD = torch.mean(-0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp(), dim = 1), dim = 0) 52 | KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) 53 | return self.alpha * BCE + self.kld_weight * self.gamma * KLD, BCE, KLD 54 | 55 | 56 | ################### 57 | # 58 | # Regression Losses - https://github.com/tuantle/regression-losses-pytorch 59 | # 60 | ################### 61 | 62 | class LogCoshLoss(torch.nn.Module): 63 | def __init__(self): 64 | super().__init__() 65 | 66 | def forward(self, y_t, y_prime_t): 67 | ey_t = y_t - y_prime_t 68 | return torch.mean(torch.log(torch.cosh(ey_t + 1e-12))) 69 | 70 | 71 | class XTanhLoss(torch.nn.Module): 72 | def __init__(self): 73 | super().__init__() 74 | 75 | def forward(self, y_t, y_prime_t): 76 | ey_t = y_t - y_prime_t 77 | return torch.mean(ey_t * torch.tanh(ey_t)) 78 | 79 | 80 | class XSigmoidLoss(torch.nn.Module): 81 | def __init__(self): 82 | super().__init__() 83 | 84 | def forward(self, y_t, y_prime_t): 85 | ey_t = y_t - y_prime_t 86 | return torch.mean(2 * ey_t / (1 + torch.exp(-ey_t)) - ey_t) -------------------------------------------------------------------------------- /blog/site/_config.yml: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # Book settings 3 | title: "Analytics and Integrative Machine Learning" 4 | author: AIML @ NCAR 5 | logo: ../NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png 6 | exclude_patterns : [_build, Thumbs.db, .DS_Store, "**.ipynb_checkpoints"] 7 | 8 | 9 | ####################################################################################### 10 | # Execution settings 11 | execute: 12 | execute_notebooks : off # Whether to execute notebooks at build time. Must be one of ("auto", "force", "cache", "off") 13 | cache : "" # A path to the jupyter cache that will be used to store execution artifacts. Defaults to `_build/.jupyter_cache/` 14 | exclude_patterns : [] # A list of patterns to *skip* in execution (e.g. a notebook that takes a really long time) 15 | timeout : 30 # The maximum time (in seconds) each notebook cell is allowed to run. 16 | run_in_temp : false # If `True`, then a temporary directory will be created and used as the command working directory (cwd), 17 | # otherwise the notebook's parent directory will be the cwd. 18 | allow_errors : false # If `False`, when a code cell raises an error the execution is stopped, otherwise all cells are always run. 19 | stderr_output : show # One of 'show', 'remove', 'remove-warn', 'warn', 'error', 'severe' 20 | 21 | ####################################################################################### 22 | # Parse and render settings 23 | parse: 24 | myst_enable_extensions: # default extensions to enable in the myst parser. 
See https://myst-parser.readthedocs.io/en/latest/using/syntax-optional.html 25 | # - amsmath 26 | - colon_fence 27 | # - deflist 28 | - dollarmath 29 | # - html_admonition 30 | # - html_image 31 | - linkify 32 | # - replacements 33 | # - smartquotes 34 | - substitution 35 | 36 | myst_url_schemes : [mailto, http, https] # URI schemes that will be recognised as external URLs in Markdown links 37 | 38 | ####################################################################################### 39 | # HTML-specific settings 40 | html: 41 | favicon : "" # A path to a favicon image 42 | use_edit_page_button : false # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in 43 | use_repository_button : false # Whether to add a link to your repository button 44 | use_issues_button : false # Whether to add an "open an issue" button 45 | extra_navbar : Powered by Jupyter Book # Will be displayed underneath the left navbar. 46 | extra_footer : "" # Will be displayed underneath the footer. 47 | google_analytics_id : "" # A GA id that can be used to track book views. 48 | home_page_in_navbar : true # Whether to include your home page in the left Navigation Bar 49 | baseurl : "" # The base URL where your book will be hosted. Used for creating image previews and social links. e.g.: https://mypage.com/mybook/ 50 | comments: 51 | hypothesis : false 52 | utterances : false 53 | 54 | ####################################################################################### 55 | # LaTeX-specific settings 56 | latex: 57 | latex_engine : pdflatex # one of 'pdflatex', 'xelatex' (recommended for unicode), 'luatex', 'platex', 'uplatex' 58 | use_jupyterbook_latex : true # use jupyterbook-latex for pdf builds as default 59 | 60 | ####################################################################################### 61 | # Launch button settings 62 | launch_buttons: 63 | notebook_interface : jupyterlab # The interface interactive links will activate ["classic", "jupyterlab"] 64 | binderhub_url : https://mybinder.org # The URL of the BinderHub (e.g., https://mybinder.org) 65 | jupyterhub_url : "" # The URL of the JupyterHub (e.g., https://datahub.berkeley.edu) 66 | thebe : false # Add a thebe button to pages (requires the repository to run on Binder) 67 | colab_url : "" # The URL of Google Colab (https://colab.research.google.com) 68 | 69 | repository: 70 | url : https://github.com/NCAR/aiml-utils # The URL to your book's repository 71 | path_to_book : blog/site # A path to your book's folder, relative to the repository root. 
72 |   branch            : master   # Which branch of the repository should be used when creating links
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/objective.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import copy
5 | import optuna
6 | import logging
7 | import traceback
8 | 
9 | from aimlutils.echo.src.base_objective import *
10 | from .data_generator import DataGenerator
11 | from model import Conv2DNeuralNetwork
12 | 
13 | from holodecml.callbacks import get_callbacks
14 | from aimlutils.echo.src.pruners import KerasPruningCallback
15 | 
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | def custom_updates(trial, conf):
21 | 
22 |     # Get list of hyperparameters from the config
23 |     hyperparameters = conf["optuna"]["parameters"]
24 | 
25 |     # Now update some via custom rules
26 |     filter1 = trial.suggest_int(**hyperparameters["filter1"]["settings"])
27 |     filter2 = trial.suggest_int(**hyperparameters["filter2"]["settings"])
28 |     filter3 = trial.suggest_int(**hyperparameters["filter3"]["settings"])
29 |     kernel1 = trial.suggest_int(**hyperparameters["kernel1"]["settings"])
30 |     kernel2 = trial.suggest_int(**hyperparameters["kernel2"]["settings"])
31 |     kernel3 = trial.suggest_int(**hyperparameters["kernel3"]["settings"])
32 |     pool1 = trial.suggest_int(**hyperparameters["pool1"]["settings"])
33 |     pool2 = trial.suggest_int(**hyperparameters["pool2"]["settings"])
34 |     pool3 = trial.suggest_int(**hyperparameters["pool3"]["settings"])
35 |     dense1 = trial.suggest_int(**hyperparameters["dense1"]["settings"])
36 |     dense2 = trial.suggest_int(**hyperparameters["dense2"]["settings"])
37 | 
38 |     conf["conv2d_network"]["filters"] = [filter1, filter2, filter3]
39 |     conf["conv2d_network"]["kernel_sizes"] = [kernel1, kernel2, kernel3]
40 |     conf["conv2d_network"]["pool_sizes"] = [pool1, pool2, pool3]
41 |     conf["conv2d_network"]["dense_sizes"] = [dense1, dense2]
42 | 
43 |     return conf
44 | 
45 | 
46 | class Objective(BaseObjective):
47 | 
48 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
49 | 
50 |         # Initialize the base class (note that BaseObjective does not take a study argument)
51 |         BaseObjective.__init__(self, config, metric, device)
52 | 
53 | 
54 |     def train(self, trial, conf):
55 | 
56 |         # Custom updates
57 |         conf = custom_updates(trial, conf)
58 | 
59 |         # Set up some globals
60 |         path_data = conf["path_data"]
61 |         num_particles = conf["num_particles"]
62 |         split = 'train'
63 |         subset = False
64 |         output_cols = ["x", "y", "z", "d", "hid"]
65 | 
66 |         input_shape = (600, 400, 1)
67 |         batch_size = conf["conv2d_network"]["batch_size"]
68 |         n_particles = conf["num_particles"]
69 |         output_channels = len(output_cols) - 1
70 | 
71 |         # Load the data
72 |         train_gen = DataGenerator(
73 |             path_data, num_particles, "train", subset,
74 |             output_cols, batch_size, maxnum_particles = 3, shuffle = False
75 |         )
76 |         train_scalers = train_gen.get_transform()
77 |         valid_gen = DataGenerator(
78 |             path_data, num_particles, "test", subset,
79 |             output_cols, batch_size, scaler = train_scalers, maxnum_particles = 3, shuffle = False
80 |         )
81 | 
82 |         # Load the model
83 |         model = Conv2DNeuralNetwork(**conf["conv2d_network"])
84 |         model.build_neural_network(input_shape, n_particles, output_channels)
85 | 
86 |         # Load callbacks
87 |         callbacks = get_callbacks(conf["callbacks"])
88 | 
89 |         # Load optuna keras pruning callback
90 |         pruning_callback = KerasPruningCallback(trial, self.metric)
91 |         callbacks.append(pruning_callback)
92 | 
93 |         # Train a model
94 |         try:  # Aim to catch instances when the GPU memory overflows
95 |             blackbox = model.model.fit(
96 |                 train_gen,
97 |                 validation_data=valid_gen,
98 |                 epochs=conf["conv2d_network"]["epochs"],
99 |                 verbose=True,
100 |                 callbacks=callbacks,
101 |                 use_multiprocessing=True,
102 |                 workers=8,
103 |                 max_queue_size=100
104 |             )
105 |         except Exception:  # When that happens, let optuna consider it as a pruned trial
106 |             raise optuna.TrialPruned()
107 | 
108 |         if trial.should_prune():
109 |             raise optuna.TrialPruned()
110 | 
111 |         # Return the validation accuracy for the last epoch.
112 |         objective = blackbox.history[self.metric][-1]
113 | 
114 |         results_dictionary = {
115 |             self.metric: objective
116 |         }
117 | 
118 |         return results_dictionary
119 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/base_objective.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | from aimlutils.echo.src.trial_suggest import trial_suggest_loader
5 | from collections import defaultdict
6 | import copy, os, sys, random
7 | import pandas as pd
8 | import logging
9 | import optuna
10 | 
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | def recursive_update(nested_keys, dictionary, update):
16 |     if isinstance(dictionary, dict) and len(nested_keys) > 1:
17 |         recursive_update(nested_keys[1:], dictionary[nested_keys[0]], update)
18 |     else:
19 |         dictionary[nested_keys[0]] = update
20 | 
21 | 
22 | class BaseObjective:
23 | 
24 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
25 | 
26 |         self.config = config
27 |         self.metric = metric
28 |         self.device = f"cuda:{device}" if device != "cpu" else "cpu"
29 | 
30 |         self.results = defaultdict(list)
31 |         save_path = config["optuna"]["save_path"]
32 |         self.results_fn = os.path.join(save_path, f"hyper_opt_{random.randint(0, 100000)}.csv")
33 |         while os.path.isfile(self.results_fn):
34 |             rand_index = random.randint(0, 100000)
35 |             self.results_fn = os.path.join(save_path, f"hyper_opt_{rand_index}.csv")
36 | 
37 |         logger.info(f"Initialized an objective to be optimized with metric {metric}")
38 |         logger.info(f"Using device {device}")
39 |         logger.info(f"Saving study/trial results to local file {self.results_fn}")
40 | 
41 |     def update_config(self, trial):
42 | 
43 |         logger.info(
44 |             f"Attempting to automatically update the model configuration using optuna's suggested parameters"
45 |         )
46 | 
47 |         # Make a copy of the config that we can edit
48 |         conf = copy.deepcopy(self.config)
49 | 
50 |         # Update the fields that can be matched automatically (through the name field)
51 |         updated = []
52 |         hyperparameters = conf["optuna"]["parameters"]
53 |         for named_parameter, update in hyperparameters.items():
54 |             if ":" in named_parameter:
55 |                 recursive_update(
56 |                     named_parameter.split(":"),
57 |                     conf,
58 |                     trial_suggest_loader(trial, update))
59 |                 updated.append(named_parameter)
60 |             else:
61 |                 if named_parameter in conf:
62 |                     conf[named_parameter] = trial_suggest_loader(trial, update)
63 |                     updated.append(named_parameter)
64 | 
65 |         logger.info(f"Those that got updated automatically: {updated}")
66 |         return conf
67 | 
68 |     # Deprecated as of the writing of the report.py script
69 | 
70 |     def save(self, trial, results_dict):
71 | 
72 |         # Make sure the relevant metric was placed into the results dictionary
73 |         single_objective = isinstance(self.metric, str)
74 |         if single_objective:
75 |             if self.metric not in results_dict:
76 |                 raise OSError(
77 |                     "You must return the metric result to the hyperparameter optimizer"
78 |                 )
79 |         else:
80 |             for metric in self.metric:
81 |                 if metric not in results_dict:
82 |                     raise OSError(
83 |                         "You must return the metric result to the hyperparameter optimizer"
84 |                     )
85 | 
86 |         # Save the hyperparameters used in the trial
87 |         self.results["trial"].append(trial.number)
88 |         for param, value in trial.params.items():
89 |             self.results[param].append(value)
90 | 
91 |         # Save the metric and "other metrics"
92 |         for metric, value in results_dict.items():
93 |             self.results[metric].append(value)
94 | 
95 |         # Save pruning boolean (append so every column stays list-valued)
96 |         self.results["pruned"].append(int(trial.should_prune()))
97 |         #self.results["complete"] = int(trial.state == optuna.trial.TrialState.COMPLETE)
98 | 
99 |         # Save the df of results to disk
100 |         pd.DataFrame.from_dict(self.results).to_csv(self.results_fn)
101 | 
102 |         logger.info(
103 |             f"Saving trial {trial.number} results to local file {self.results_fn}"
104 |         )
105 | 
106 |         if single_objective:
107 |             return results_dict[self.metric]
108 |         else:
109 |             return [results_dict[metric] for metric in self.metric]
110 | 
111 |     def __call__(self, trial):
112 | 
113 |         # Automatically update the config, when possible
114 |         conf = self.update_config(trial)
115 | 
116 |         # Train the model
117 |         logger.info(
118 |             f"Beginning to train the model using the latest parameters from optuna"
119 |         )
120 | 
121 |         result = self.train(trial, conf)
122 | 
123 |         return self.save(trial, result)
124 | 
125 |     def train(self, trial, conf):
126 |         raise NotImplementedError
--------------------------------------------------------------------------------
/aimlutils/torch/checkpoint/checkpointer.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import List, Dict
3 | import pandas as pd
4 | import numpy as np
5 | import logging
6 | import torch
7 | import time
8 | import math
9 | import glob
10 | import os
11 | 
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def load_checkpoint(checkpoint_path: str):
17 |     # It's weird that if `map_location` is not given, it will be extremely slow.
18 |     return torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
19 | 
20 | 
21 | class EarlyStopping:
22 |     """Early stops the training if validation loss doesn't improve after a given patience."""
23 | 
24 |     def __init__(self,
25 |                  patience=7,
26 |                  verbose=False,
27 |                  delta=0,
28 |                  save_every_epoch=False,
29 |                  path_save='checkpoint.pt',
30 |                  tag = None):
31 |         """
32 |         Args:
33 |             patience (int): How long to wait after last time validation loss improved.
34 |                             Default: 7
35 |             verbose (bool): If True, prints a message for each validation loss improvement.
36 |                             Default: False
37 |             delta (float): Minimum change in the monitored quantity to qualify as an improvement.
38 |                            Default: 0
39 |             path_save (str): Path for the checkpoint to be saved to.
40 |                              Default: 'checkpoint.pt'
41 |             save_every_epoch (bool): If True, also save a (non-best) checkpoint at every epoch.
42 |             tag (str): Optional suffix for the saved checkpoint filenames.
43 |         """
44 |         self.patience = patience
45 |         self.verbose = verbose
46 |         self.counter = 0
47 |         self.best_score = None
48 |         self.early_stop = False
49 |         self.val_loss_min = np.Inf
50 |         self.delta = delta
51 |         self.path = path_save
52 |         self.dirpath = os.path.dirname(self.path)
53 |         self.save_every_epoch = save_every_epoch
54 |         self.tag = tag
55 | 
56 |         logger.info(
57 |             f"Loaded EarlyStopping checkpointer with patience {self.patience}")
58 | 
59 |     def __call__(self, epoch, val_loss, model, optimizer):
60 | 
61 |         score = val_loss
62 | 
63 |         if self.best_score is None:
64 |             self.best_score = score
65 |             self.save_checkpoint(epoch, val_loss, model, optimizer, best=True)
66 |         elif score < (self.best_score - self.delta):
67 |             self.best_score = score
68 |             self.save_checkpoint(epoch, val_loss, model, optimizer, best=True)
69 |             self.counter = 0
70 |         else:
71 |             self.counter += 1
72 |             logger.info(
73 |                 f'EarlyStopping counter: {self.counter} out of {self.patience}')
74 |             if self.save_every_epoch:
75 |                 self.save_checkpoint(
76 |                     epoch, val_loss, model, optimizer, best=False)
77 |             if self.counter >= self.patience:
78 |                 self.early_stop = True
79 | 
80 |     def save_checkpoint(self, epoch, val_loss, model, optimizer, best=False):
81 |         '''Saves model when validation loss decreases.'''
82 |         if best:
83 |             logger.info(
84 |                 f'Validation loss decreased on epoch {epoch} ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model.'
85 |             )
86 |         checkpoint = {
87 |             'epoch': epoch,
88 |             'val_loss': val_loss,
89 |             'model_state_dict': model.state_dict(),
90 |             'optimizer_state_dict': optimizer.state_dict(),
91 |             'lr': self.print_learning_rate(optimizer)
92 |         }
93 |         if not best:  # save a model, not the best one seen so far
94 |             if self.tag is not None:
95 |                 save_path = os.path.join(self.dirpath, f"checkpoint_{self.tag}.pt")
96 |             else:
97 |                 save_path = os.path.join(self.dirpath, "checkpoint.pt")
98 |             torch.save(checkpoint, save_path)
99 |         else:  # save best model so far
100 |             if self.tag is not None:
101 |                 save_path = os.path.join(self.dirpath, f"best_{self.tag}.pt")
102 |             else:
103 |                 save_path = os.path.join(self.dirpath, "best.pt")
104 |             torch.save(checkpoint, save_path)
105 |             # Only track the minimum (best) validation loss seen so far
106 |             self.val_loss_min = val_loss
107 | 
108 |     def print_learning_rate(self, optimizer):
109 |         for param_group in optimizer.param_groups:
110 |             return param_group["lr"]
111 | 
112 | 
113 | class MetricsLogger:
114 | 
115 |     def __init__(self, path_save: str, reload: bool = False) -> None:
116 | 
117 |         self.path_save = os.path.join(f"{path_save}", "training_log.csv")
118 | 
119 |         if reload:
120 |             self.load()
121 |             logger.info(
122 |                 f"Loaded a previous metrics file from {self.path_save}")
123 |         else:
124 |             self.metrics = defaultdict(list)
125 |             logger.info(
126 |                 f"Loaded a metrics logger {self.path_save} to track the training results")
127 | 
128 |     def update(self, data: Dict[str, float]) -> None:
129 |         for key, value in data.items():
130 |             self.metrics[key].append(value)
131 |         self.save()
132 | 
133 |     def to_pandas(self) -> pd.DataFrame:
134 |         return pd.DataFrame.from_dict(self.metrics)
135 | 
136 |     def save(self) -> None:
137 |         self.to_pandas().to_csv(
138 |             self.path_save,
139 |             sep=',',
140 |             encoding='utf-8',
141 |             index=None
142 |         )
143 | 
144 |     def load(self) -> None:
145 |         # Read columns back as lists so update() can keep appending to them
146 |         self.metrics = defaultdict(list, pd.read_csv(
147 |             self.path_save,
148 |             sep=',',
149 |             encoding='utf-8'
150 |         ).to_dict(orient="list"))
151 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/data_generator.py:
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import math 5 | import time 6 | import random 7 | import traceback 8 | import xarray as xr 9 | import numpy as np 10 | import pandas as pd 11 | from datetime import datetime 12 | 13 | import matplotlib.pyplot as plt 14 | import scipy.sparse 15 | from scipy.ndimage import gaussian_filter 16 | 17 | from tqdm.auto import tqdm 18 | 19 | import numpy.fft as FFT 20 | from typing import List, Dict 21 | 22 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 23 | from tensorflow.keras.layers import (Input, Conv2D, Dense, Flatten, 24 | MaxPool2D, RepeatVector, Lambda, 25 | LeakyReLU, Dropout) 26 | from tensorflow.keras.models import Model, save_model 27 | from tensorflow.keras.optimizers import Adam, SGD 28 | import tensorflow.keras.backend as K 29 | 30 | from keras_radam import RAdam 31 | from keras_radam.training import RAdamOptimizer 32 | 33 | import tensorflow as tf 34 | 35 | 36 | num_particles_dict = { 37 | 1: '1particle', 38 | 3: '3particle', 39 | 'multi': 'multiparticle', 40 | '50-100': '50-100'} 41 | 42 | split_dict = { 43 | 'train' : 'training', 44 | 'test' : 'test', 45 | 'valid': 'validation'} 46 | 47 | 48 | class DataGenerator(tf.keras.utils.Sequence): 49 | 'Generates data for Keras' 50 | def __init__( 51 | 52 | self, 53 | path_data: str, 54 | num_particles: int, 55 | split: str, 56 | subset: bool, 57 | output_cols: List[str], 58 | batch_size: int, 59 | shuffle: bool = True, 60 | maxnum_particles: int = False, 61 | scaler: Dict[str, str] = False) -> None: 62 | 63 | 'Initialization' 64 | self.ds = self.open_dataset(path_data, num_particles, split) 65 | self.batch_size = batch_size 66 | self.output_cols = [x for x in output_cols if x != 'hid'] 67 | self.subset = subset 68 | self.hologram_numbers = self.ds.hologram_number.values 69 | if shuffle: 70 | random.shuffle(self.hologram_numbers) 71 | self.num_particles = num_particles 72 | self.xsize = len(self.ds.xsize.values) 73 | self.ysize = len(self.ds.ysize.values) 74 | self.shuffle = shuffle 75 | self.maxnum_particles = maxnum_particles 76 | 77 | if not scaler: 78 | self.scaler = {col: StandardScaler() for col in output_cols} 79 | for col in output_cols: 80 | scale = self.ds[col].values 81 | self.scaler[col].fit(scale.reshape(scale.shape[-1], -1)) 82 | else: 83 | self.scaler = scaler 84 | 85 | def get_transform(self): 86 | return self.scaler 87 | 88 | def __len__(self): 89 | 'Denotes the number of batches per epoch' 90 | return math.ceil(len(self.hologram_numbers) / self.batch_size) 91 | 92 | def __getitem__(self, idx): 93 | 'Generate one batch of data' 94 | holograms = self.hologram_numbers[ 95 | idx * self.batch_size: (idx + 1) * self.batch_size 96 | ] 97 | x_out, y_out, w_out = self._batch(holograms) 98 | return x_out, y_out, w_out 99 | 100 | def on_epoch_end(self): 101 | 'Updates indexes after each epoch' 102 | if self.shuffle == True: 103 | random.shuffle(self.hologram_numbers) 104 | 105 | def _batch(self, holograms: List[int]): 106 | 'Create a batch of data' 107 | try: 108 | 109 | x_out = np.zeros(( 110 | len(holograms), self.xsize, self.ysize 111 | )) 112 | y_out = np.zeros(( 113 | len(holograms), 114 | self.maxnum_particles if self.maxnum_particles else self.num_particles, 115 | len(self.output_cols) 116 | )) 117 | # Move the scaler.transform to here 118 | 119 | a = time.time() 120 | for k, hologram in enumerate(holograms): 121 | im = 
self.ds["image"][hologram].values 122 | x_out[k] = (im-np.mean(im)) / (np.std(im)) 123 | #A = np.log(np.abs(OpticsFFT(A))) 124 | particles = np.where(self.ds["hid"] == hologram + 1)[0] 125 | for l, p in enumerate(particles): 126 | for m, col in enumerate(self.output_cols): 127 | val = self.ds[col][p].values 128 | y_out[k, l, m] = self.scaler[col].transform( 129 | val.reshape(1, -1) 130 | ) 131 | if self.maxnum_particles and len(particles) < self.maxnum_particles: 132 | for l in range(len(particles), self.maxnum_particles): 133 | for m, col in enumerate(self.output_cols): 134 | val = y_out[k, l, m] 135 | y_out[k, l, m] = self.scaler[col].transform( 136 | val.reshape(1, -1) 137 | ) 138 | # 139 | # convert y_out to sparse if we are using padding 140 | # if self.maxnum_particles: 141 | # y_out = sparse_vstack([ 142 | # csr_matrix(y_out[i]) for i in y_out.shape[0] 143 | # ]) 144 | 145 | x_out = np.expand_dims(x_out, axis=-1) 146 | return x_out, y_out, [None] #class weights option 147 | 148 | except: 149 | print(traceback.print_exc()) 150 | 151 | def open_dataset(self, path_data, num_particles, split): 152 | """ 153 | Opens a HOLODEC file 154 | 155 | Args: 156 | path_data: (str) Path to dataset directory 157 | num_particles: (int or str) Number of particles per hologram 158 | split: (str) Dataset split of either 'train', 'valid', or 'test' 159 | 160 | Returns: 161 | ds: (xarray Dataset) Opened dataset 162 | """ 163 | path_data = os.path.join(path_data, self.dataset_name(num_particles, split)) 164 | 165 | if not os.path.isfile(path_data): 166 | print(f"Data file does not exist at {path_data}. Exiting.") 167 | raise 168 | 169 | ds = xr.open_dataset(path_data) 170 | return ds 171 | 172 | def dataset_name(self, num_particles, split, file_extension='nc'): 173 | """ 174 | Return the dataset filename given user inputs 175 | 176 | Args: 177 | num_particles: (int or str) Number of particles per hologram 178 | split: (str) Dataset split of either 'train', 'valid', or 'test' 179 | file_extension: (str) Dataset file extension 180 | 181 | Returns: 182 | ds_name: (str) Dataset name 183 | """ 184 | 185 | valid = [1,3,'multi','50-100'] 186 | if num_particles not in valid: 187 | raise ValueError("results: num_particles must be one of %r." % valid) 188 | num_particles = num_particles_dict[num_particles] 189 | 190 | valid = ['train','test','valid'] 191 | if split not in valid: 192 | raise ValueError("results: split must be one of %r." 
% valid) 193 | split = split_dict[split] 194 | ds_name = f'synthetic_holograms_{num_particles}_{split}.{file_extension}' 195 | 196 | return ds_name -------------------------------------------------------------------------------- /aimlutils/torch/trainers/trainers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import tqdm 5 | import torch 6 | import pickle 7 | import logging 8 | 9 | from torchvision.utils import save_image 10 | from holodecml.vae.losses import * 11 | 12 | import numpy as np 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class BaseTrainer: 18 | 19 | def __init__(self, 20 | model, 21 | optimizer, 22 | train_gen, 23 | valid_gen, 24 | dataloader, 25 | valid_dataloader, 26 | start_epoch = 0, 27 | epochs = 100, 28 | device = "cpu", 29 | clip = 1.0, 30 | alpha = 1.0, 31 | beta = 1.0, 32 | kld_weight = [], 33 | path_save = "./", 34 | test_image = None): 35 | 36 | self.model = model 37 | self.optimizer = optimizer 38 | self.train_gen = train_gen 39 | self.valid_gen = valid_gen 40 | self.dataloader = dataloader 41 | self.valid_dataloader = valid_dataloader 42 | self.batch_size = dataloader.batch_size 43 | self.path_save = path_save 44 | self.device = device 45 | 46 | self.start_epoch = start_epoch 47 | self.epochs = epochs 48 | 49 | self.alpha = alpha 50 | self.beta = beta 51 | 52 | self.kld_weight = kld_weight 53 | if len(kld_weight) == 0: 54 | self.kld_weight = [ 55 | self.batch_size/self.train_gen.__len__(), 56 | self.batch_size/self.valid_gen.__len__() 57 | ] 58 | self.criterion_train = SymmetricMSE( 59 | self.alpha, self.beta, self.kld_weight[0] 60 | ) 61 | self.criterion_test = SymmetricMSE( 62 | self.alpha, self.beta, self.kld_weight[1] 63 | ) 64 | 65 | self.test_image = test_image 66 | 67 | # Gradient clipping through hook registration 68 | for p in self.model.parameters(): 69 | p.register_hook(lambda grad: torch.clamp(grad, -clip, clip)) 70 | logger.info(f"Clipping gradients to range [-{clip}, {clip}]") 71 | 72 | # Create the save directory if it does not exist 73 | try: 74 | os.makedirs(path_save) 75 | except: 76 | pass 77 | 78 | 79 | def train_one_epoch(self, epoch): 80 | 81 | self.model.train() 82 | batches_per_epoch = int(np.ceil(self.train_gen.__len__() / self.batch_size)) 83 | batch_group_generator = tqdm.tqdm( 84 | enumerate(self.dataloader), 85 | total=batches_per_epoch, 86 | leave=True 87 | ) 88 | 89 | epoch_losses = {"loss": [], "bce": [], "kld": []} 90 | for idx, images in batch_group_generator: 91 | 92 | images = images.to(self.device) 93 | recon_images, mu, logvar = self.model(images) 94 | loss, bce, kld = self.criterion_train(recon_images, images, mu, logvar) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | batch_loss = loss.item() #/ self.batch_size 101 | bce_loss = bce.item() #/ self.batch_size 102 | kld_loss = kld.item() #/ self.batch_size 103 | 104 | epoch_losses["loss"].append(batch_loss) 105 | epoch_losses["bce"].append(bce_loss) 106 | epoch_losses["kld"].append(kld_loss) 107 | 108 | loss = np.mean(epoch_losses["loss"]) 109 | bce = np.mean(epoch_losses["bce"]) 110 | kld = np.mean(epoch_losses["kld"]) 111 | 112 | to_print = "loss: {:.3f} bce: {:.3f} kld: {:.3f}".format(loss, bce, kld) 113 | batch_group_generator.set_description(to_print) 114 | batch_group_generator.update() 115 | 116 | return loss, bce, kld 117 | 118 | 119 | def test(self, epoch): 120 | 121 | self.model.eval() 122 | batches_per_epoch = 
int(np.ceil(self.valid_gen.__len__() / self.batch_size)) 123 | 124 | with torch.no_grad(): 125 | 126 | batch_group_generator = tqdm.tqdm( 127 | enumerate(self.valid_dataloader), 128 | total=batches_per_epoch, 129 | leave=True 130 | ) 131 | 132 | epoch_losses = {"loss": [], "bce": [], "kld": []} 133 | for idx, images in batch_group_generator: 134 | 135 | images = images.to(self.device) 136 | recon_images, mu, logvar = self.model(images) 137 | loss, bce, kld = self.criterion_test(recon_images, images, mu, logvar) 138 | 139 | batch_loss = loss.item() #/ self.batch_size 140 | bce_loss = bce.item() #/ self.batch_size 141 | kld_loss = kld.item() #/ self.batch_size 142 | 143 | epoch_losses["loss"].append(batch_loss) 144 | epoch_losses["bce"].append(bce_loss) 145 | epoch_losses["kld"].append(kld_loss) 146 | 147 | loss = np.mean(epoch_losses["loss"]) 148 | bce = np.mean(epoch_losses["bce"]) 149 | kld = np.mean(epoch_losses["kld"]) 150 | 151 | to_print = "val_loss: {:.3f} val_bce: {:.3f} val_kld: {:.3f}".format(loss, bce, kld) 152 | batch_group_generator.set_description(to_print) 153 | batch_group_generator.update() 154 | 155 | if os.path.isfile(self.test_image): 156 | with open(self.test_image, "rb") as fid: 157 | pic = pickle.load(fid) 158 | self.compare(epoch, pic) 159 | 160 | return loss, bce, kld 161 | 162 | 163 | def compare(self, epoch, x): 164 | x = x.to(self.device) 165 | recon_x, _, _ = self.model(x) 166 | compare_x = torch.cat([x, recon_x]) 167 | save_image(compare_x.data.cpu(), f'{self.path_save}/image_epoch_{epoch}.png') 168 | 169 | 170 | def train(self, 171 | scheduler, 172 | early_stopping, 173 | metrics_logger): 174 | 175 | logger.info( 176 | f"Training the model for up to {self.epochs} epochs starting at epoch {self.start_epoch}" 177 | ) 178 | 179 | flag = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) 180 | 181 | for epoch in range(self.start_epoch, self.epochs): 182 | 183 | train_loss, train_bce, train_kld = self.train_one_epoch(epoch) 184 | test_loss, test_bce, test_kld = self.test(epoch) 185 | 186 | scheduler.step(test_loss if flag else epoch) 187 | early_stopping(epoch, test_loss, self.model, self.optimizer) 188 | 189 | # Write results to the callback logger 190 | result = { 191 | "epoch": epoch, 192 | "train_loss": train_loss, 193 | "train_bce": train_bce, 194 | "train_kld": train_kld, 195 | "valid_loss": test_loss, 196 | "valid_bce": test_bce, 197 | "valid_kld": test_kld, 198 | "lr": early_stopping.print_learning_rate(self.optimizer) 199 | } 200 | metrics_logger.update(result) 201 | 202 | if early_stopping.early_stop: 203 | logger.info("Early stopping") 204 | break 205 | -------------------------------------------------------------------------------- /aimlutils/echo/examples/torch/objective.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import copy 5 | import optuna 6 | import logging 7 | import traceback 8 | 9 | from overrides import overrides 10 | from holodecml.vae.losses import * 11 | from holodecml.vae.visual import * 12 | from holodecml.vae.models import * 13 | from holodecml.vae.trainers import * 14 | from holodecml.vae.transforms import * 15 | from holodecml.vae.optimizers import * 16 | from holodecml.vae.data_loader import * 17 | from holodecml.vae.checkpointer import * 18 | from aimlutils.hyper_opt.base_objective import * 19 | 20 | from torch import nn 21 | from torch.optim.lr_scheduler import * 22 | from torch.utils.data import Dataset, 
DataLoader 23 | from typing import List, Dict, Callable, Union, Any, TypeVar, Tuple 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def custom_updates(trial, conf): 30 | 31 | # Get list of hyperparameters from the config 32 | hyperparameters = conf["optuna"]["parameters"] 33 | 34 | # Now update some via custom rules 35 | num_dense = trial_suggest_loader(trial, hyperparameters["num_dense"]) 36 | dense1 = trial_suggest_loader(trial, hyperparameters['dense_hidden_dim1']) 37 | dense2 = trial_suggest_loader(trial, hyperparameters['dense_hidden_dim2']) 38 | dr1 = trial_suggest_loader(trial, hyperparameters['dr1']) 39 | dr2 = trial_suggest_loader(trial, hyperparameters['dr2']) 40 | 41 | # Update the config based on optuna suggestions 42 | conf["model"]["dense_hidden_dims"] = [dense1] + [dense2 for k in range(num_dense)] 43 | conf["model"]["dense_dropouts"] = [dr1] + [dr2 for k in range(num_dense)] 44 | return conf 45 | 46 | 47 | class Objective(BaseObjective): 48 | 49 | def __init__(self, study, config, metric = "val_loss", device = "cpu"): 50 | 51 | BaseObjective.__init__(self, study, config, metric, device) 52 | 53 | if self.device != "cpu": 54 | torch.backends.cudnn.benchmark = True 55 | 56 | 57 | def train(self, trial, conf): 58 | 59 | ########################################################### 60 | # 61 | # Implement custom changes to config 62 | # 63 | ########################################################### 64 | 65 | conf = custom_updates(trial, conf) 66 | 67 | ########################################################### 68 | # 69 | # Load ML pipeline, train the model, and return the result 70 | # 71 | ########################################################### 72 | 73 | # Load custom option for the VAE/compressor models 74 | model_type = conf["type"] 75 | 76 | # Load image transformations. 
77 | transform = LoadTransformations(conf["transforms"], device = self.device) 78 | 79 | # Load dataset readers 80 | train_gen = LoadReader( 81 | reader_type = model_type, 82 | split = "train", 83 | transform = transform, 84 | scaler = None, 85 | config = conf["data"] 86 | ) 87 | 88 | valid_gen = LoadReader( 89 | reader_type = model_type, 90 | split = "test", 91 | transform = transform, 92 | scaler = train_gen.get_transform(), 93 | config = conf["data"], 94 | ) 95 | 96 | # Load data iterators from pytorch 97 | n_workers = conf['iterator']['num_workers'] 98 | 99 | #logging.info(f"Loading training data iterator using {n_workers} workers") 100 | 101 | dataloader = DataLoader( 102 | train_gen, 103 | **conf["iterator"] 104 | ) 105 | 106 | valid_dataloader = DataLoader( 107 | valid_gen, 108 | **conf["iterator"] 109 | ) 110 | 111 | # Load the model 112 | model = LoadModel(model_type, conf["model"], self.device) 113 | 114 | # Load the optimizer 115 | optimizer_config = conf["optimizer"] 116 | optimizer = LoadOptimizer( 117 | optimizer_config["type"], 118 | model.parameters(), 119 | optimizer_config["lr"], 120 | optimizer_config["weight_decay"] 121 | ) 122 | 123 | # Load the trainer 124 | trainer = CustomTrainer( 125 | model = model, 126 | optimizer = optimizer, 127 | train_gen = train_gen, 128 | valid_gen = valid_gen, 129 | dataloader = dataloader, 130 | valid_dataloader = valid_dataloader, 131 | device = self.device, 132 | **conf["trainer"] 133 | ) 134 | 135 | # Initialize LR annealing scheduler 136 | if "ReduceLROnPlateau" in conf["callbacks"]: 137 | schedule_config = conf["callbacks"]["ReduceLROnPlateau"] 138 | scheduler = ReduceLROnPlateau(trainer.optimizer, **schedule_config) 139 | logging.info( 140 | f"Loaded ReduceLROnPlateau learning rate annealer with patience {schedule_config['patience']}" 141 | ) 142 | elif "ExponentialLR" in conf["callbacks"]: 143 | schedule_config = conf["callbacks"]["ExponentialLR"] 144 | scheduler = ExponentialLR(trainer.optimizer, **schedule_config) 145 | logging.info( 146 | f"Loaded ExponentialLR learning rate annealer with reduce factor {schedule_config['gamma']}" 147 | ) 148 | 149 | # Initialize early stopping 150 | checkpoint_config = conf["callbacks"]["EarlyStopping"] 151 | early_stopping = EarlyStopping(**checkpoint_config) 152 | 153 | # Train the model 154 | val_loss, val_mse, val_bce, val_acc = trainer.train( 155 | trial, scheduler, early_stopping, self.metric 156 | ) 157 | 158 | results = { 159 | "val_loss": val_loss, 160 | "val_mse": val_mse, 161 | "val_bce": val_bce, 162 | "val_acc": val_acc 163 | } 164 | 165 | return self.save(trial, results) 166 | 167 | 168 | class CustomTrainer(BaseEncoderTrainer): 169 | 170 | def train(self, 171 | trial, 172 | scheduler, 173 | early_stopping, 174 | metric = "val_loss"): 175 | 176 | flag = isinstance( 177 | scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) 178 | 179 | for epoch in range(self.start_epoch, self.epochs): 180 | 181 | try: 182 | train_loss, train_mse, train_bce, train_accuracy = self.train_one_epoch(epoch) 183 | test_loss, test_mse, test_bce, test_accuracy = self.test(epoch) 184 | 185 | if "val_loss" in metric: 186 | metric_val = test_loss 187 | elif "val_mse_loss" in metric: 188 | metric_val = test_mse 189 | elif "val_bce_loss" in metric: 190 | metric_val = test_bce 191 | elif "val_acc" in metric: 192 | metric_val = -test_accuracy 193 | else: 194 | supported = "val_loss, val_mse_loss, val_bce_loss, val_acc" 195 | raise ValueError(f"The metric {metric} is not supported. 
Choose from {supported}") 196 | 197 | trial.report(-metric_val, step=epoch+1) 198 | scheduler.step(metric_val if flag else epoch) 199 | early_stopping(epoch, metric_val, self.model, self.optimizer) 200 | 201 | except Exception as E: # CUDA memory overflow 202 | print(traceback.print_exc()) 203 | raise optuna.TrialPruned() 204 | 205 | if trial.should_prune(): 206 | raise optuna.TrialPruned() 207 | 208 | if early_stopping.early_stop: 209 | break 210 | 211 | return test_loss, test_mse, test_bce, test_accuracy -------------------------------------------------------------------------------- /aimlutils/echo/examples/keras/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | import math 4 | import time 5 | import random 6 | import traceback 7 | import xarray as xr 8 | import numpy as np 9 | import pandas as pd 10 | from datetime import datetime 11 | 12 | import matplotlib.pyplot as plt 13 | import scipy.sparse 14 | from scipy.ndimage import gaussian_filter 15 | 16 | from tqdm.auto import tqdm 17 | 18 | import numpy.fft as FFT 19 | from typing import List, Dict 20 | 21 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 22 | from tensorflow.keras.layers import (Input, Conv2D, Dense, Flatten, 23 | MaxPool2D, RepeatVector, Lambda, 24 | LeakyReLU, Dropout) 25 | from tensorflow.keras.models import Model, save_model 26 | from tensorflow.keras.optimizers import Adam, SGD 27 | import tensorflow.keras.backend as K 28 | 29 | from keras_radam import RAdam 30 | from keras_radam.training import RAdamOptimizer 31 | 32 | 33 | 34 | class Conv2DNeuralNetwork(object): 35 | """ 36 | A Conv2D Neural Network Model that can support an arbitrary numbers of 37 | layers. 38 | 39 | Attributes: 40 | filters: List of number of filters in each Conv2D layer 41 | kernel_sizes: List of kernel sizes in each Conv2D layer 42 | conv2d_activation: Type of activation function for conv2d layers 43 | pool_sizes: List of Max Pool sizes 44 | dense_sizes: Sizes of dense layers 45 | dense_activation: Type of activation function for dense layers 46 | output_activation: Type of activation function for output layer 47 | lr: Optimizer learning rate 48 | optimizer: Name of optimizer or optimizer object. 
49 | adam_beta_1: Exponential decay rate for the first moment estimates 50 | adam_beta_2: Exponential decay rate for the first moment estimates 51 | sgd_momentum: Stochastic Gradient Descent momentum 52 | decay: Optimizer decay 53 | loss: Name of loss function or loss object 54 | batch_size: Number of examples per batch 55 | epochs: Number of epochs to train 56 | verbose: Level of detail to provide during training 57 | model: Keras Model object 58 | """ 59 | def __init__( 60 | self, 61 | filters=(8,), 62 | kernel_sizes=(5,), 63 | conv2d_activation="relu", 64 | pool_sizes=(4,), 65 | pool_dropout=0.0, 66 | dense_sizes=(64,), 67 | dense_activation="relu", 68 | dense_dropout = 0.0, 69 | output_activation="linear", 70 | lr=0.001, 71 | optimizer="adam", 72 | adam_beta_1=0.9, 73 | adam_beta_2=0.999, 74 | sgd_momentum=0.9, 75 | decay=0, 76 | loss="mse", 77 | metrics = [], 78 | batch_size=32, 79 | epochs=2, 80 | verbose=0 81 | ): 82 | 83 | self.filters = filters 84 | self.kernel_sizes = [tuple((v,v)) for v in kernel_sizes] 85 | self.conv2d_activation = conv2d_activation 86 | self.pool_sizes = [tuple((v,v)) for v in pool_sizes] 87 | self.pool_dropout = pool_dropout 88 | self.dense_sizes = dense_sizes 89 | self.dense_activation = dense_activation 90 | self.dense_dropout = dense_dropout 91 | self.output_activation = output_activation 92 | self.lr = lr 93 | self.optimizer = optimizer 94 | self.optimizer_obj = None 95 | self.adam_beta_1 = adam_beta_1 96 | self.adam_beta_2 = adam_beta_2 97 | self.sgd_momentum = sgd_momentum 98 | self.decay = decay 99 | self.loss = loss 100 | self.metrics = metrics 101 | self.batch_size = batch_size 102 | self.epochs = epochs 103 | self.verbose = verbose 104 | self.model = None 105 | 106 | if self.conv2d_activation == "leakyrelu": 107 | self.conv2d_activation = LeakyReLU(alpha=0.1) 108 | if self.dense_activation == "leakyrelu": 109 | self.dense_activation = LeakyReLU(alpha=0.1) 110 | if self.output_activation == "leakyrelu": 111 | self.output_activation = LeakyReLU(alpha=0.1) 112 | 113 | def build_neural_network(self, input_shape, n_particles, output_shape): 114 | """Create Keras neural network model and compile it.""" 115 | 116 | # Input 117 | conv_input = Input(shape=(input_shape), name="input") 118 | 119 | # ConvNet encoder 120 | nn_model = conv_input 121 | for h in range(len(self.filters)): 122 | nn_model = Conv2D(self.filters[h], 123 | self.kernel_sizes[h], 124 | padding="same", 125 | activation=self.conv2d_activation, 126 | kernel_initializer='he_uniform', 127 | name=f"conv2D_{h:02d}")(nn_model) 128 | nn_model = MaxPool2D(self.pool_sizes[h], padding='same', 129 | name=f"maxpool2D_{h:02d}")(nn_model) 130 | if self.pool_dropout > 0.0: 131 | nn_model = Dropout(self.pool_dropout, 132 | name = f"maxpool2D_dr_{h:02d}")(nn_model) 133 | nn_model = Flatten()(nn_model) 134 | 135 | # Classifier 136 | for h in range(len(self.dense_sizes)): 137 | nn_model = Dense(self.dense_sizes[h], 138 | activation=self.dense_activation, 139 | kernel_initializer='he_uniform', 140 | name=f"dense_{h:02d}")(nn_model) 141 | if self.dense_dropout > 0.0: 142 | nn_model = Dropout(self.dense_dropout, 143 | name=f"dense_dr_{h:02d}")(nn_model) 144 | 145 | # Output 146 | nn_model = RepeatVector(n_particles, name = "repeat")(nn_model) 147 | nn_model = Dense(output_shape, 148 | activation=self.output_activation, 149 | name=f"dense_output")(nn_model) 150 | nn_model = Lambda( 151 | self.LastLayer, 152 | input_shape = (n_particles, output_shape) 153 | )(nn_model) 154 | 155 | self.model = Model(conv_input, 
nn_model) 156 | 157 | if self.optimizer == "adam": 158 | self.optimizer_obj = Adam(lr=self.lr, clipnorm = 1.0) 159 | elif self.optimizer == "sgd": 160 | self.optimizer_obj = SGD(lr=self.lr, momentum=self.sgd_momentum, 161 | decay=self.decay) 162 | 163 | self.model.compile( 164 | optimizer=self.optimizer_obj, 165 | loss=self.loss, 166 | metrics=self.metrics 167 | ) 168 | #self.model.summary() 169 | 170 | def fit(self, x, y, xv=None, yv=None, callbacks=None): 171 | 172 | if len(x.shape[1:])==2: 173 | x = np.expand_dims(x, axis=-1) 174 | if len(y.shape) == 1: 175 | output_shape = 1 176 | else: 177 | output_shape = y.shape[1] 178 | 179 | input_shape = x.shape[1:] 180 | self.build_neural_network(input_shape, output_shape) 181 | self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs, 182 | verbose=self.verbose, validation_data=(xv, yv), callbacks=callbacks) 183 | return self.model.history.history 184 | 185 | def LastLayer(self, x): 186 | return 1.75 * K.tanh(x / 100) 187 | 188 | def predict(self, x): 189 | y_out = self.model.predict(np.expand_dims(x, axis=-1), 190 | batch_size=self.batch_size) 191 | return y_out 192 | 193 | def predict_proba(self, x): 194 | y_prob = self.model.predict(x, batch_size=self.batch_size) 195 | return y_prob 196 | 197 | def load_weights(self, weights): 198 | try: 199 | self.model.load_weights(weights) 200 | self.model.compile( 201 | optimizer=self.optimizer, 202 | loss=self.loss, 203 | metrics=self.metrics 204 | ) 205 | except: 206 | print("You must first call build_neural_network before loading weights. Exiting.") 207 | sys.exit(1) 208 | -------------------------------------------------------------------------------- /blog/site/memory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (12/22/20) Memory profiling python scripts with *memory_profiler*" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The *memory_profiler* package provides line-by-line output of how much memory is allocated for a process, cell, script, or workflow.\n", 15 | "\n", 16 | "***\n", 17 | "Installing *memory_profiler* is very easy.\n", 18 | "\n", 19 | "## Pip\n", 20 | "\n", 21 | "`pip install -U memory_profiler`\n", 22 | "\n", 23 | "## Conda\n", 24 | "\n", 25 | "`conda config --add channels conda-forge`\n", 26 | "\n", 27 | "`conda install memory_profiler`\n", 28 | "\n", 29 | "***\n", 30 | "\n", 31 | "*Memory_profiler* isn't just easy to install, it's easy to implement into your scripts, jupyter notebooks, or entire workflows. Below are a few of the many ways *memory_profiler* can be implemented. See documentation [here](https://pypi.org/project/memory-profiler/) and [here](https://github.com/pythonprofilers/memory_profiler).\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## 1. Simple decorator above any script function\n", 39 | "\n", 40 | "`@profile` above any function like in the script (`example.py`) below. You can set the precision with which the memory usage is reported.\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "Run using the following command: `python -m memory_profiler example.py` and you will generate a file called `memory_profiler.log` containing the following output.\n", 45 | "\n", 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## 2. 
Decorators above sub-functions\n", 54 | "\n", 55 | "`@profile` above any sub-functions (`test.py`) called in your main script (`example.py`) below.\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "\n", 60 | "`memory_profiler.log` will output the following:\n", 61 | "\n", 62 | "" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## 3. Eliminate -m memory_profile flag by importing module into the script\n", 70 | "\n", 71 | "\n", 72 | "" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 4. Track and plot memory as a function of time\n", 80 | "\n", 81 | "Import and decorate exactly as the above example. Then instead of python, run via `mprof run example.py`\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "This method of running will still lead to the familiar output we've seen in previous ways of using *memory_profiler*.\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "However, in addition to the line-by-line summary, the output of `mprof run` will be saved in a file that begins with `mprofile_` and ends in `.dat`. To create a plot out of this output, run `mprof plot --output=plot.png` and the following will be created:\n", 90 | "\n", 91 | "" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## 5. A more complicated script example output\n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | "" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## 6. Jupyter notebook importing module\n", 110 | "\n", 111 | "When comparing or testing various functions, *memory_profiler* can be used by importing the module's various methods." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 1, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "from memory_profiler import memory_usage" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 2, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "[45.046875, 45.046875, 45.046875, 45.046875, 45.046875]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "mem_usage = memory_usage(-1, interval=.2, timeout=1)\n", 138 | "print(mem_usage)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "def aaa():\n", 148 | " a = [1] * (10 ** 6)\n", 149 | " b = [2] * (10 ** 7)\n", 150 | " del b\n", 151 | " return a" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "[129.24609375, 129.2578125, 136.890625, 136.890625, 136.890625, 136.890625]\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "mem_usage = memory_usage(aaa, interval=.2, timeout=1)\n", 169 | "print(mem_usage)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## 7. Jupyter notebook magic function\n", 177 | "\n", 178 | "Using *memory_profiler* can be as easy as implementing notebok magic functions, as shown below." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 5, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%load_ext memory_profiler" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "peak memory: 136.90 MiB, increment: 0.00 MiB\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "%memit range(10000)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "peak memory: 136.91 MiB, increment: 0.01 MiB\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%memit range(10000000)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## 8. Other methods of running and customizing *memory_profiler*\n", 229 | "\n", 230 | "The *memory_profile* module can be run via multi-processing and will output the memory usage of child processes. *Memory_profiler* can also be used to debug via a memory threshold. There are various ways of reporting the results of the memory profiling. Finally, there are various ways to customize the output and plotting of *memory_profiler*. For more customization and further running options, please see documentation [here](https://pypi.org/project/memory-profiler/) and [here](https://github.com/pythonprofilers/memory_profiler)." 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.8.6" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 4 255 | } 256 | -------------------------------------------------------------------------------- /blog/site/optuna_mariadb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (2/26/21) A short primer on using Optuna and ECHO to interact with a sql database" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1. We have access to a MariaDB located on [thunder](https://www2.cisl.ucar.edu/resources/computational-systems/thunder-user-guide)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "MariaDB: MySQL relational database management system.\n", 22 | "\n", 23 | "The MariaDB server is accessible from an NCAR IP address, but you cannot login to MariaDB as root remotely. To interact with the database as root, you would need to ssh to thunder and from there you will be able to login to MariaDB as root to setup/manage the database. This will not affect the interaction between optuna and the database, but we will need root in order to manage the database (future).\n", 24 | "\n", 25 | "In this blog, we have a database named \"optuna\". For demonstrating purposes, we imagine that a user \"icarus\" exists. If you are at NCAR and are experimenting with mysql + optuna, you may email John Schreck about obtaining access. 
Ordinarily, to get onto thunder, you will use your NCAR password (same as for casper, cheyenne, etc)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### 2. Optuna does not have much to say with regards to its sql support. \n", 33 | "\n", 34 | "In general, this interaction is low-level, while your interaction with optuna is much higher. To that end, the simplest way to go about managing your studies is to use the create_study and delete_study methods. \n", 35 | "\n", 36 | "You may continue to use the sqlite \"storage\", but be warned that once 1000 trials are saved to the named study, the performance will degrade quickly. This is especially apparent when running the hyperparameter importance metrics, which query the database and train a tree model on the fly." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### 3. Example: Using create_study and delete_study" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "First, lets see what tables are in the \"optuna\" database on thunder (from terminal):" 51 | ] 52 | }, 53 | { 54 | "cell_type": "raw", 55 | "metadata": {}, 56 | "source": [ 57 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'show tables'" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "(I shared an ssh key, hence not having to use Duo. Details at the bottom of this tutorial)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Next, lets list the study names user \"schreck\" has saved into optuna: " 72 | ] 73 | }, 74 | { 75 | "cell_type": "raw", 76 | "metadata": {}, 77 | "source": [ 78 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Now we create a new study named \"example\":" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 1, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import optuna" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "study = optuna.create_study(\n", 104 | " study_name=\"example\", \n", 105 | " storage=\"mysql://icarus:password@thunder.ucar.edu/optuna\"\n", 106 | ")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Confirm that the study was actually created by repeating the command from earlier: " 114 | ] 115 | }, 116 | { 117 | "cell_type": "raw", 118 | "metadata": {}, 119 | "source": [ 120 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Next, in your hyperparameters.yml configuration file, we simply point to the database as follows under the optuna field:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "study_name: \"example\"\n", 137 | "storage: \"mysql://icarus:password@thunder.ucar.edu/optuna\"" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "You don't have to worry about entering your sql password, it is already contained in the storage link! 
Since we are on an NCAR server, we also do not need to use Duo, although this will be changing in the near future. The forth-coming additional security will likely become problematic, but we will deal with that later. " 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Note that you don't have to create a study beforehand if it does not exist, the optimize.py script that is used to launch a hyperparameter study, contained in the [ECHO](https://github.com/NCAR/aiml-utils/tree/master/aimlutils/echo) package, will call create_study for you:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "raw", 156 | "metadata": {}, 157 | "source": [ 158 | "python $echo/optimize.py hyperparameter.yml model.yml" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "For now, when its time to delete a study from our optuna database, simply call the optuna method delete_study:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "optuna.delete_study(\n", 175 | " study_name=\"example\", \n", 176 | " storage=\"mysql://icarus:password@thunder.ucar.edu/optuna\"\n", 177 | ")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let us double check that it was actaully removed:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "raw", 189 | "metadata": {}, 190 | "source": [ 191 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "Ordinarily, you set reload = 0 in your hyperparameters.yml file when starting a new study. If the study name already exists, optimize.py/run.py will fail with an error message (I will not delete or overwrite things automatically. That job is left up to you).\n", 199 | "\n", 200 | "When using the sqlite database solution, you simply delete that file. For sql support, the script will still complain at you, but a new parser option has been added that will facilitate the delete_study call:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "raw", 205 | "metadata": {}, 206 | "source": [ 207 | "The study {study_name} already exists in storage and reload was False.\n", 208 | "Delete it from {storage}, and try again or rerun this script\n", 209 | "with the flag: --override 1" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "E.g. you run:" 217 | ] 218 | }, 219 | { 220 | "cell_type": "raw", 221 | "metadata": {}, 222 | "source": [ 223 | "python $echo/optimize.py hyperparameter.yml model.yml --override 1" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "And the study_name will be deleted from the storage container. Note that its gone forever, so be extra careful that this is what you intended. " 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### 4. For more, checkout [this tutorial](https://www.guru99.com/data-warehousing-tutorial.html) on data warehousing." 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Feel free to email me (John Schreck, schreck@ucar.edu) with any questions / mistakes / whatever!" 
245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.8.6" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 4 269 | } 270 | -------------------------------------------------------------------------------- /aimlutils/echo/report.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import os 5 | import sys 6 | import yaml 7 | import optuna 8 | import logging 9 | import pandas as pd 10 | import matplotlib as mpl 11 | import matplotlib.pyplot as plt 12 | from argparse import ArgumentParser 13 | from typing import Dict 14 | 15 | 16 | def args(): 17 | parser = ArgumentParser(description= 18 | "report.py: Get the status/progress of a hyperparameter study" 19 | ) 20 | 21 | parser.add_argument("hyperparameter", type=str, help= 22 | "Path to the hyperparameter configuration containing your inputs." 23 | ) 24 | 25 | parser.add_argument( 26 | "-p", 27 | "--plot", 28 | dest="plot", 29 | type=str, 30 | default=False, 31 | help="A yaml structured file containining settings for matplotlib/pylab objects" 32 | ) 33 | 34 | parser.add_argument( 35 | "-t", 36 | "--n_trees", 37 | dest="n_trees", 38 | type=int, 39 | default=64, 40 | help="The number of trees to use in parameter importance models. Default is 64." 41 | ) 42 | 43 | parser.add_argument( 44 | "-d", 45 | "--max_depth", 46 | dest="max_depth", 47 | type=int, 48 | default=64, 49 | help="The maximum depth to use in parameter importance models. Default is 64." 50 | ) 51 | 52 | return vars(parser.parse_args()) 53 | 54 | 55 | def update_figure(fig: mpl.figure.Figure, 56 | params: Dict[str, str] = False) -> mpl.figure.Figure: 57 | """ 58 | Updates some mpl Figure parameters. Only limited support for now. 59 | In a future version the optuna plots will be moved here 60 | and expanded customization will be enabled. 61 | 62 | Returns a matplotlib Figure 63 | 64 | Inputs: 65 | fig: a matplotlib Figure 66 | params: a dictionary containing mpl fields 67 | """ 68 | 69 | if params is False: 70 | fig.set_yscale("log") 71 | mpl.rcParams.update({"figure.dpi": 300}) 72 | else: 73 | if "rcparams" in params: 74 | mpl.rcParams.update(**params["rcparams"]) 75 | if "set_xlim" in params: 76 | fig.set_xlim(params["set_xlim"]) 77 | if "set_ylim" in params: 78 | fig.set_ylim(params["set_ylim"]) 79 | if "set_xscale" in params: 80 | fig.set_xscale(params["set_xscale"]) 81 | if "set_yscale" in params: 82 | fig.set_yscale(params["set_yscale"]) 83 | 84 | plt.tight_layout() 85 | return fig 86 | 87 | 88 | def plot_wrapper(study: optuna.study.Study, 89 | identifier: str, 90 | save_path: str, 91 | params: Dict[str, str] = False): 92 | 93 | 94 | """ 95 | Creates and saves an intermediate values plot. 96 | 97 | Does not return. 98 | 99 | Inputs: 100 | study: an Optuna study object 101 | identifier: a string identifier for selecting the optuna plot method 102 | save_path: a path where the plot should be saved 103 | params: a dictionary containing mpl fields. 
Default = False 104 | """ 105 | 106 | flag = isinstance(params, dict) 107 | if flag and identifier in params: 108 | params = params[identifier] 109 | else: 110 | flag = False 111 | 112 | # Use optunas mpl object for now 113 | if identifier == "intermediate_values": 114 | fig = optuna.visualization.matplotlib.plot_intermediate_values(study) 115 | elif identifier == "optimization_history": 116 | fig = optuna.visualization.matplotlib.plot_optimization_history(study) 117 | elif identifier == "pareto_front": 118 | fig = optuna.multi_objective.visualization.plot_pareto_front(study) 119 | else: 120 | raise OSError(f"An incorrect optuna plot identifier {identifier} was used") 121 | 122 | fig = update_figure(fig, params) 123 | 124 | if flag and "save_path" in params: 125 | save_path = params["save_path"] 126 | 127 | figure_save_path = os.path.join(save_path, f"{identifier}.pdf") 128 | plt.savefig(figure_save_path) 129 | 130 | logging.info( 131 | f"Saving the {identifier} plot to file at {figure_save_path}" 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | if len(sys.argv) < 2: 138 | raise OSError( 139 | "Usage: python report.py hyperparameter.yml [optional arguments]" 140 | "To see the available parser options: python report.py --help" 141 | ) 142 | 143 | args_dict = args() 144 | 145 | hyper_config = args_dict.pop("hyperparameter") 146 | plot_config = args_dict.pop("plot") if "plot" in args_dict else False 147 | 148 | # Options for the parameter importance tree models 149 | n_trees = args_dict.pop("n_trees") 150 | max_depth = args_dict.pop("max_depth") 151 | 152 | # Check if hyperparameter config file exists 153 | if os.path.isfile(hyper_config): 154 | with open(hyper_config) as f: 155 | hyper_config = yaml.load(f, Loader=yaml.FullLoader) 156 | else: 157 | raise OSError( 158 | f"Hyperparameter optimization config file {hyper_config} does not exist" 159 | ) 160 | 161 | if plot_config is not False: 162 | if os.path.isfile(plot_config): 163 | with open(plot_config) as p: 164 | plot_config = yaml.load(p, Loader=yaml.FullLoader) 165 | else: 166 | raise OSError( 167 | f"Hyperparameter optimization plot file {plot_config} does not exist" 168 | ) 169 | 170 | 171 | # Set up a logger 172 | root = logging.getLogger() 173 | root.setLevel(logging.INFO) 174 | formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s') 175 | 176 | # Stream output to stdout 177 | ch = logging.StreamHandler() 178 | ch.setLevel(logging.INFO) 179 | ch.setFormatter(formatter) 180 | root.addHandler(ch) 181 | 182 | save_path = hyper_config["optuna"]["save_path"] 183 | study_name = hyper_config["optuna"]["study_name"] 184 | storage = hyper_config["optuna"]["storage"] 185 | reload_study = bool(hyper_config["optuna"]["reload"]) 186 | cached_study = f"{save_path}/{study_name}" 187 | 188 | direction = hyper_config["optuna"]["direction"] 189 | single_objective = isinstance(direction, str) 190 | 191 | # Load from database 192 | #storage = f'postgresql+psycopg2://john:schreck@localhost/{cached_study}' 193 | #storage = f"sqlite:///{cached_study}" 194 | 195 | if single_objective: 196 | study = optuna.load_study(study_name=study_name, storage=storage) 197 | else: 198 | study = optuna.multi_objective.study.load_study( 199 | study_name=study_name, 200 | storage=storage 201 | ) 202 | 203 | # Check a few other stats 204 | pruned_trials = [ 205 | t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED 206 | ] 207 | complete_trials = [ 208 | t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE 209 | 
] 210 | 211 | logging.info(f'Number of requested trials per worker: {hyper_config["optuna"]["n_trials"]}') 212 | logging.info(f"Number of trials in the database: {len(study.trials)}") 213 | logging.info(f"Number of pruned trials: {len(pruned_trials)}") 214 | logging.info(f"Number of completed trials: {len(complete_trials)}") 215 | 216 | if len(complete_trials) == 0: 217 | logging.info("There are no complete trials in this study.") 218 | logging.info("Wait until the workers finish a few trials and try again.") 219 | sys.exit() 220 | 221 | logging.info(f"Best trial: {study.best_trial.value}") 222 | 223 | if len(complete_trials) > 1: 224 | f_importance = optuna.importance.FanovaImportanceEvaluator( 225 | n_trees = n_trees, max_depth = max_depth).evaluate(study=study) 226 | logging.info(f"fANOVA parameter importance {dict(f_importance)}") 227 | mdi_importance = optuna.importance.MeanDecreaseImpurityImportanceEvaluator( 228 | n_trees = n_trees, max_depth = max_depth).evaluate(study=study) 229 | logging.info(f"Mean decrease impurity (MDI) parameter importance {dict(mdi_importance)}") 230 | 231 | logging.info("Best parameters in the study:") 232 | for param, val in study.best_params.items(): 233 | logging.info(f"{param}: {val}") 234 | 235 | if len(study.trials) < hyper_config["optuna"]["n_trials"]: 236 | logging.warning( 237 | "Not all of the trials completed due to the wall-time." 238 | ) 239 | logging.warning( 240 | "Set reload = 1 in the hyperparameter config and resubmit some more workers to finish!" 241 | ) 242 | 243 | save_fn = os.path.join(save_path, f"{study_name}.csv") 244 | logging.info(f"Saving the results of the study to file at {save_fn}") 245 | study.trials_dataframe().to_csv(save_fn, index = None) 246 | 247 | if single_objective: 248 | 249 | # Plot the optimization_history 250 | plot_wrapper(study, "optimization_history", save_path, plot_config) 251 | 252 | # Plot the intermediate_values 253 | plot_wrapper(study, "intermediate_values", save_path, plot_config) 254 | 255 | else: 256 | # Plot the pareto front 257 | plot_wrapper(study, "pareto_front", save_path, plot_config) 258 | -------------------------------------------------------------------------------- /aimlutils/echo/run.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from aimlutils.echo.src.samplers import samplers 5 | from aimlutils.utils.gpu import gpu_report 6 | import importlib.machinery 7 | import pandas as pd 8 | import numpy as np 9 | import logging 10 | import optuna 11 | import time 12 | import glob 13 | import yaml 14 | import sys 15 | import os 16 | 17 | start_the_clock = time.time() 18 | 19 | 20 | def get_sec(time_str): 21 | """Get Seconds from time.""" 22 | h, m, s = time_str.split(':') 23 | return int(h) * 3600 + int(m) * 60 + int(s) 24 | 25 | # References 26 | # https://github.com/optuna/optuna/issues/1365 27 | # https://docs.dask.org/en/latest/setup/hpc.html 28 | # https://dask-cuda.readthedocs.io/en/latest/worker.html 29 | # https://optuna.readthedocs.io/en/stable/tutorial/004_distributed.html#distributed 30 | 31 | if len(sys.argv) != 3: 32 | print( 33 | "Usage: python run.py hyperparameter.yml model.yml" 34 | ) 35 | sys.exit() 36 | 37 | # Set up a logger 38 | root = logging.getLogger() 39 | root.setLevel(logging.DEBUG) 40 | formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s') 41 | 42 | # Stream output to stdout 43 | ch = logging.StreamHandler() 44 | ch.setLevel(logging.INFO) 45 | 
ch.setFormatter(formatter) 46 | root.addHandler(ch) 47 |  48 | ################################################################ 49 |  50 | # Check if hyperparameter config file exists 51 | if os.path.isfile(sys.argv[1]): 52 |     with open(sys.argv[1]) as f: 53 |         hyper_config = yaml.load(f, Loader=yaml.FullLoader) 54 | else: 55 |     raise OSError( 56 |         f"Hyperparameter optimization config file {sys.argv[1]} does not exist" 57 |     ) 58 |  59 | # Check if the wall-time exists 60 | if "slurm" in hyper_config: 61 |     if "t" not in hyper_config["slurm"]["batch"]: 62 |         raise OSError( 63 |             "You must supply a wall time in the hyperparameter config at slurm:batch:t" 64 |         ) 65 | if "pbs" in hyper_config: 66 |     if not any([("walltime" in x) for x in hyper_config["pbs"]["batch"]["l"]]): 67 |         raise OSError( 68 |             "You must supply a wall time in the hyperparameter config at pbs:batch:l" 69 |         ) 70 |  71 | # Check if model config file exists 72 | if os.path.isfile(sys.argv[2]): 73 |     with open(sys.argv[2]) as f: 74 |         model_config = yaml.load(f, Loader=yaml.FullLoader) 75 | else: 76 |     raise OSError( 77 |         f"Model config file {sys.argv[2]} does not exist" 78 |     ) 79 |  80 | # Copy the optuna details to the model config 81 | model_config["optuna"] = hyper_config["optuna"] 82 |  83 | # Check if path to objective method exists 84 | if os.path.isfile(model_config["optuna"]["objective"]): 85 |     loader = importlib.machinery.SourceFileLoader( 86 |         "custom_objective", 87 |         model_config["optuna"]["objective"] 88 |     ) 89 |     mod = loader.load_module() 90 |     from custom_objective import Objective 91 | else: 92 |     raise OSError( 93 |         f'The objective file {model_config["optuna"]["objective"]}\ 94 |         does not exist' 95 |     ) 96 |  97 | # Check if the optimization metric direction is supported 98 | direction = model_config["optuna"]["direction"] 99 | single_objective = isinstance(direction, str) 100 |  101 | if single_objective: 102 |     if direction not in ["maximize", "minimize"]: 103 |         raise OSError( 104 |             f"Optimizer direction {direction} not recognized. \ 105 |             Choose from maximize or minimize" 106 |         ) 107 | else: 108 |     for direc in direction: 109 |         if direc not in ["maximize", "minimize"]: 110 |             raise OSError( 111 |                 f"Optimizer direction {direc} not recognized. 
\ 112 | Choose from maximize or minimize" 113 | ) 114 | 115 | logging.info(f"Direction of optimization {direction}") 116 | 117 | ### Add other config checks 118 | 119 | ################################################################ 120 | 121 | # Stream output to log file 122 | if "log" in hyper_config: 123 | savepath = hyper_config["log"]["save_path"] if "save_path" in hyper_config["log"] else "log.txt" 124 | mode = "a+" if bool(hyper_config["optuna"]["reload"]) else "w" 125 | fh = logging.FileHandler(savepath, 126 | mode=mode, 127 | encoding='utf-8') 128 | fh.setLevel(logging.DEBUG) 129 | fh.setFormatter(formatter) 130 | root.addHandler(fh) 131 | 132 | # Get the path to save all the data 133 | save_path = model_config["optuna"]["save_path"] 134 | logging.info(f"Saving optimization details to {save_path}") 135 | 136 | # Grab the metric 137 | if isinstance(model_config["optuna"]["metric"], list): 138 | metric = [str(m) for m in model_config["optuna"]["metric"]] 139 | else: 140 | metric = str(model_config["optuna"]["metric"]) 141 | logging.info(f"Using metric {metric}") 142 | 143 | # Get list of devices and initialize the Objective class 144 | if bool(model_config["optuna"]["gpu"]): 145 | try: 146 | gpu_report = sorted( 147 | gpu_report().items(), 148 | key = lambda x: x[1], 149 | reverse = True 150 | ) 151 | device = gpu_report[0][0] 152 | except: 153 | logging.warning( 154 | "The gpu is not responding to a call from nvidia-smi.\ 155 | Setting gpu device = 0, but this may fail." 156 | ) 157 | device = 0 158 | else: 159 | device = 'cpu' 160 | logging.info(f"Using device {device}") 161 | 162 | ################################################################ 163 | 164 | # Initialize the study object 165 | study_name = model_config["optuna"]["study_name"] 166 | reload_study = bool(model_config["optuna"]["reload"]) 167 | 168 | # cached_study = f"{save_path}/{study_name}" 169 | 170 | # if not os.path.isfile(cached_study) or not reload_study: 171 | # load_if_exists = False 172 | # elif not reload_study: 173 | # os.remove(cached_study) 174 | # load_if_exists = reload_study 175 | # else: 176 | # load_if_exists = True 177 | 178 | # Identify the storage location 179 | storage = model_config["optuna"]["storage"] #f"sqlite:///{cached_study}" 180 | 181 | # Initialize the sampler 182 | if "sampler" not in hyper_config["optuna"]: 183 | if single_objective: # single-objective 184 | sampler = optuna.samplers.TPESampler() 185 | else: # multi-objective equivalent of TPESampler 186 | sampler = optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler() 187 | else: 188 | sampler = samplers(hyper_config["optuna"]["sampler"]) 189 | 190 | # Load or initiate study 191 | if single_objective: 192 | study = optuna.create_study(study_name=study_name, 193 | storage=storage, 194 | sampler=sampler, 195 | direction=direction, 196 | load_if_exists=True) 197 | else: 198 | study = optuna.multi_objective.study.create_study( 199 | study_name=study_name, 200 | storage=storage, 201 | sampler=sampler, 202 | directions=direction, 203 | load_if_exists=True 204 | ) 205 | logging.info(f"Loaded study {study_name} located at {storage}") 206 | 207 | # Initialize objective function 208 | objective = Objective(model_config, metric, device) 209 | 210 | # Optimize it 211 | logging.info( 212 | f'Running optimization for {model_config["optuna"]["n_trials"]} trials' 213 | ) 214 | 215 | # Get the cluster job wall-time 216 | if "slurm" in hyper_config: 217 | wall_time = hyper_config["slurm"]["batch"]["t"] 218 | elif "pbs" in hyper_config: 219 | 
wall_time = False 220 |     for option in hyper_config["pbs"]["batch"]["l"]: 221 |         if "walltime" in option: 222 |             wall_time = option.split("walltime=")[-1] 223 |             break 224 |     if wall_time is False: 225 |         logging.warning("Could not process the walltime for run.py. Assuming 12 hours.") 226 |         wall_time = "12:00:00" 227 | wall_time_secs = get_sec(wall_time) 228 |  229 | logging.info( 230 |     f"This script will run for a fraction of the wall-time of {wall_time} and try to die without error" 231 | ) 232 |  233 | run_times = [] 234 | estimated_run_time = wall_time_secs 235 |  236 | # study.optimize( 237 | #     objective, 238 | #     n_trials = int(model_config["optuna"]["n_trials"]), 239 | #     timeout = estimated_run_time, 240 | #     catch = (ValueError,) 241 | # ) 242 |  243 | # Testing out a way to stop running trials if too close to wall-time. 244 | # Update to computing the mean of the run times of all completed trials in the database. 245 |  246 | for iteration in range(int(model_config["optuna"]["n_trials"])): 247 |  248 |     try: 249 |         start_time = time.time() 250 |         study.optimize( 251 |             objective, 252 |             n_trials = 1, 253 |             timeout = estimated_run_time, 254 |             #catch = (ValueError,) 255 |         ) 256 |         end_time = time.time() 257 |         run_times.append(end_time - start_time) 258 |  259 |     except KeyboardInterrupt: 260 |         logging.warning( 261 |             "Received signal to die from keyboard. Exiting." 262 |         ) 263 |         break 264 |  265 |     except Exception as E: 266 |         logging.warning( 267 |             f"Dying early due to error {E}" 268 |         ) 269 |         break 270 |  271 |     if len(run_times) > 1: 272 |         average_run_time = np.mean(run_times) 273 |         sigma_run_time = np.std(run_times) if len(run_times) > 2 else 0.0 274 |         estimated_run_time = average_run_time + 2 * sigma_run_time 275 |         time_left = wall_time_secs - (time.time() - start_the_clock) 276 |         if time_left < estimated_run_time: 277 |             logging.warning( 278 |                 "Dying early as estimated run-time exceeds the time remaining on this node." 279 |             ) 280 |             break -------------------------------------------------------------------------------- /aimlutils/echo/README.ipynb: -------------------------------------------------------------------------------- 1 | # hyper_opt: A distributed multi-gpu hyperparameter optimization package built with optuna 2 |  3 | ### Usage 4 |  5 |     python optimize.py hyperparameters.yml model.yml 6 |  7 | ### Dependencies 8 |  9 | There are three files that must be supplied to use the optimize script: 10 |  11 | * A custom objective function that performs the model training and returns the metric value to be optimized. 12 |  13 | * A configuration file specifying the hyperparameter optimization settings. 14 |  15 | * A model configuration file that contains the available hyperparameters that will get optimized. 16 |  17 | ### Custom objective class 18 | The user must supply a custom **Objective** class (objective.py) that is composed with an internal **BaseObjective** class (base_objective.py), and contains a method named **train** that returns the value of the optimization metric in a dictionary. See the examples directory for both torch and Keras examples. Note that the objective class only needs to return the metric value (in dictionary form) and does not depend on the machine learning library used. 
For example, a simple Objective class template will have the following structure: 19 |  20 |     from aimlutils.hyper_opt.base_objective import * 21 |  22 |     class Objective(BaseObjective): 23 |  24 |         def __init__(self, study, config, metric = "val_loss", device = "cpu"): 25 |  26 |             # Initialize the base class 27 |             BaseObjective.__init__(self, study, config, metric, device) 28 |  29 |         def train(self, trial, conf): 30 |  31 |             # Make any custom edits to the model conf before using it to train a model. 32 |             conf = custom_updates(trial, conf) 33 |  34 |             ... 35 |  36 |             result = Model.fit(...) 37 |  38 |             results_dictionary = { 39 |                 "val_loss": result["val_loss"], 40 |                 "loss": result["loss"], 41 |                 ... 42 |                 "val_accuracy": result["val_accuracy"] 43 |             } 44 |             return results_dictionary 45 |  46 | The BaseObjective must be initialized using the input parameters to the Objective (they must match!). The metric used to evaluate model performance must always be in the results dictionary, while other metrics that the user may want to track will also be stored and saved so long as they are included in the results dictionary. The base class will call the train method from its **__call__** method, and finishes up by calling a save method that takes care of writing the metric(s) details to file. Check out the script run.py to see how things are called. 47 |  48 | Note that the first line in the train method states that any custom changes to the model configuration (conf) must be done here. If custom changes are required, the user must supply a method named **custom_updates** in addition to the Objective class (save both in the same script). See also the section **Custom configuration edits** below for more details. 49 |  50 | ### Hyperparameter optimizer configuration 51 | There are three main fields, log, slurm, and optuna, with variable subfields within each field. The log field allows us to save a file for printing messages and warnings that are placed in areas throughout the package. The slurm field allows the user to specify how many GPU nodes should be used, and supports any slurm setting. The optuna field allows the user to configure the optimization procedure, including specifying which parameters will be used, as well as the performance metric. For example, consider the configuration settings: 52 |  53 | * log 54 |   + save_path: "path/to/data/log.txt" 55 | * slurm 56 |   + jobs: 20 57 |   + batch: 58 |     + account: "NAML0001" 59 |     + gres: "gpu:v100:1" 60 |     + mem: "128G" 61 |     + n: 8 62 |     + t: "12:00:00" 63 |     + J: "hyper_opt" 64 |     + o: "hyper_opt.out" 65 |     + e: "hyper_opt.err" 66 | * optuna 67 |   + name: "holodec_optimization.db" 68 |   + reload: 0 69 |   + objective: "examples/torch_objective.py" 70 |   + metric: "val_loss" 71 |   + direction: "minimize" 72 |   + n_trials: 500 73 |   + gpu: True 74 |   + save_path: 'test' 75 |   + sampler: 76 |     + type: "TPESampler" 77 |   + parameters: 78 |     + num_dense: 79 |       + type: "int" 80 |       + settings: 81 |         + name: "num_dense" 82 |         + low: 0 83 |         + high: 10 84 |     + dropout: 85 |       + type: "float" 86 |       + settings: 87 |         + name: "dr" 88 |         + low: 0.0 89 |         + high: 0.5 90 |     + **optimizer:learning_rate**: 91 |       + type: "loguniform" 92 |       + settings: 93 |         + name: "lr" 94 |         + low: 0.0000001 95 |         + high: 0.01 96 |  97 | The subfields within the optuna field have the following functionality: 98 |  99 | * name: ($\color{red}{string}$) The name of the study. 100 | * reload: ($\color{red}{bool}$) Whether to continue using a previous study (True) or to initialize a new study (False). If your initial number of workers does not reach the number of trials and you wish to resubmit, set to True. 101 | * objective: ($\color{red}{string}$) The path to the user-supplied objective class (it must be named objective.py) 102 | * metric: ($\color{red}{string}$) The metric to be used to determine the model performance. 103 | * direction: ($\color{red}{string}$) Indicates which direction the metric must go to represent improvement (pick from maximize or minimize) 104 | * n_trials: ($\color{red}{int}$) The number of trials in the study. 105 | * gpu: ($\color{red}{bool}$) Use the gpu or cpu. 106 | * save_path: ($\color{red}{string}$) Directory path where data will be saved. 107 | * sampler 108 |   + type: ($\color{red}{string}$) Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space). 109 | * parameters 110 |   + type: ($\color{red}{string}$) Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform". 111 |   + settings: This field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) 112 | 
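Collected into plain YAML form, the settings listed above correspond roughly to the sketch below. It is abbreviated and illustrative only: the values are the placeholder examples from the listing, not tested defaults, and some slurm batch keys are omitted for brevity.

    log:
      save_path: "path/to/data/log.txt"
    slurm:
      jobs: 20
      batch:
        account: "NAML0001"
        gres: "gpu:v100:1"
        mem: "128G"
        n: 8
        t: "12:00:00"
    optuna:
      name: "holodec_optimization.db"
      reload: 0
      objective: "examples/torch_objective.py"
      metric: "val_loss"
      direction: "minimize"
      n_trials: 500
      gpu: True
      save_path: "test"
      sampler:
        type: "TPESampler"
      parameters:
        num_dense:
          type: "int"
          settings:
            name: "num_dense"
            low: 0
            high: 10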
If the initial number of workers does not reach the number of trials and you wish to resubmit, set this to True.
101 | * objective: ($\color{red}{string}$) The path to the user-supplied objective class (it must be named objective.py).
102 | * metric: ($\color{red}{string}$) The metric to be used to determine the model performance.
103 | * direction: ($\color{red}{string}$) Indicates which direction the metric must go to represent improvement (pick from maximize or minimize).
104 | * n_trials: ($\color{red}{int}$) The number of trials in the study.
105 | * gpu: ($\color{red}{bool}$) Use the gpu or cpu.
106 | * save_path: ($\color{red}{string}$) Directory path where data will be saved.
107 | * sampler
108 |   + type: ($\color{red}{string}$) Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space).
109 | * parameters
110 |   + type: ($\color{red}{string}$) Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform".
111 |   + settings: This field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform).
112 | 
113 | ### Model configuration
114 | The model configuration file should be the one you have been using up to this point to train models. This package will take the suggested hyperparameters from an optuna trial and make changes to the model configuration. This can either be done automatically by this package, or the user may supply an additional method for making custom changes. For example, consider the (truncated) configuration for training a model to predict hologram properties with the holodec data:
115 | 
116 | * model:
117 |   + image_channels: 1
118 |   + hidden_dims: [3, 94, 141, 471, 425, 1122]
119 |   + z_dim: 1277
120 |   + dense_hidden_dims: [1000]
121 |   + dense_dropouts: [0.0]
122 |   + tasks: ["x", "y", "z", "d", "binary"]
123 |   + **optimizer**:
124 |     * type: "lookahead-diffgrad"
125 |     * **learning_rate**: 0.000631
126 |     * weight_decay: 0.0
127 |   + trainer:
128 |     * start_epoch: 0
129 |     * epochs: 1
130 |     * clip: 1.0
131 |     * alpha: 1.0
132 |     * beta: 0.1
133 |     * path_save: "test"
134 | 
135 | The model configuration can be automatically updated using this package if the name of the parameter specified in the hyperparameter configuration, optuna.parameters, can be used as a nested lookup key in the model configuration's nested dictionary.
For example, observe in the hyperparameter configuration file that the named parameter **optimizer:learning_rate** contains a colon, which is used downstream to split the name into multiple keys that allow us, starting from the top of the nested tree in the model configuration, to work our way down until the field is located and the trial-suggested value is substituted in. In this example, the split keys are ["optimizer", "learning_rate"].
136 | 
137 | This scheme will work in general as long as the named parameter in optuna.parameters uses : as the separator, and, once split, the resulting list can be used to locate the relevant field in the model configuration.
138 | 
139 | 
140 | ### Custom configuration edits
141 | 
142 | The user can also supply rules for updating the model configuration file by including a method named **custom_updates**, which will make the desired changes to the configuration file using optuna trial parameter guesses.
143 | 
144 | In the example configurations described above, the hyperparameter configuration contained an optuna.parameters field "num_dense," but this field is not present in the model configuration. There is however a dense_hidden_dims field in the model configuration that contains a list of the layer sizes in the model (where the number of layers is the length of the list). In the example, just one layer is specified, but we want to vary that number. To use the num_dense hyperparameter from the hyperparameter configuration file, we need to create the following custom method:
145 | 
146 |     def custom_updates(trial, conf):
147 | 
148 |         # Get list of hyperparameters from the config
149 |         hyperparameters = conf["optuna"]["parameters"]
150 | 
151 |         # Now update some via custom rules (unpack the "settings" subfield, matching the "int" type above)
152 |         num_dense = trial.suggest_int(**hyperparameters["num_dense"]["settings"])
153 | 
154 |         # Update the config based on optuna's suggestion
155 |         conf["model"]["dense_hidden_dims"] = [1000 for k in range(num_dense)]
156 | 
157 |         return conf
158 | 
159 | This custom method should be called first thing in the custom Objective.train method. You may have noticed that the configuration (named conf) contains both hyperparameter and model fields. This package will copy the hyperparameter fields to the model configuration for convenience, so that we can reduce the total number of class and method dependencies (which helps me keep the code generalized). This occurs in the run.py script.
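160 | 
161 | For orientation, the call flow looks roughly like the following minimal sketch. This is illustrative only; the real implementation lives in base_objective.py and may differ in its details:
162 | 
163 |     # Illustrative sketch: see base_objective.py for the actual class
164 |     class BaseObjective:
165 | 
166 |         def __init__(self, study, config, metric = "val_loss", device = "cpu"):
167 |             # Store the inputs that the custom Objective passes through
168 |             self.study = study
169 |             self.config = config
170 |             self.metric = metric
171 |             self.device = device
172 | 
173 |         def __call__(self, trial):
174 |             # Run the user-supplied training loop ...
175 |             results_dictionary = self.train(trial, self.config)
176 |             # ... write the returned metrics to file ...
177 |             self.save(trial, results_dictionary)
178 |             # ... and hand the optimization metric back to optuna
179 |             return results_dictionary[self.metric]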
--------------------------------------------------------------------------------
/aimlutils/torch/optimizers/optimizers.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import logging
4 | import torch.nn as nn
5 | import itertools as it
6 | from typing import Dict
7 | import sys
8 | 
9 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
10 | 
11 | 
12 | def LoadOptimizer(optimizer_type: str, parameters: Dict[str, float], learning_rate: float = 0.001, weight_decay=0.0):
13 | 
14 |     if optimizer_type == "lookahead-diffgrad":
15 |         optimizer = LookaheadDiffGrad(
16 |             parameters, lr=learning_rate, weight_decay=weight_decay)
17 |     elif optimizer_type == "diffgrad":
18 |         optimizer = DiffGrad(parameters, lr=learning_rate,
19 |                              weight_decay=weight_decay)
20 |     elif optimizer_type == "lookahead-radam":
21 |         optimizer = LookaheadRAdam(
22 |             parameters, lr=learning_rate, weight_decay=weight_decay)
23 |     elif optimizer_type == "radam":
24 |         optimizer = RAdam(parameters, lr=learning_rate,
25 |                           weight_decay=weight_decay)
26 |     elif optimizer_type == "adam":
27 |         optimizer = torch.optim.Adam(
28 |             parameters, lr=learning_rate, weight_decay=weight_decay)
29 |     elif optimizer_type == "sgd":
30 |         optimizer = torch.optim.SGD(
31 |             parameters, lr=learning_rate, weight_decay=weight_decay)
32 |     else:
33 |         logger.warning(
34 |             f"Optimizer type {optimizer_type} is unknown. Exiting with error."
35 |         )
36 |         sys.exit(1)
37 | 
38 |     logger.info(
39 |         f"Loaded the {optimizer_type} optimizer with learning rate {learning_rate} and L2 penalty {weight_decay}"
40 |     )
41 |     return optimizer
42 | 
43 | 
44 | class DiffGrad(torch.optim.Optimizer):
45 |     # Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py
46 |     r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam.
47 |     It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_.
48 |     Arguments:
49 |         params (iterable): iterable of parameters to optimize or dicts defining
50 |             parameter groups
51 |         lr (float, optional): learning rate (default: 1e-3)
52 |         betas (Tuple[float, float], optional): coefficients used for computing
53 |             running averages of gradient and its square (default: (0.9, 0.999))
54 |         eps (float, optional): term added to the denominator to improve
55 |             numerical stability (default: 1e-8)
56 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
57 |         amsgrad (boolean, optional): whether to use the AMSGrad variant of this
58 |             algorithm from the paper `On the Convergence of Adam and Beyond`_
59 |             (default: False)
60 |     .. _diffGrad: An Optimization Method for Convolutional Neural Networks:
61 |         https://arxiv.org/abs/1909.11015
62 |     .. _Adam\: A Method for Stochastic Optimization:
63 |         https://arxiv.org/abs/1412.6980
64 |     .. 
_On the Convergence of Adam and Beyond: 65 | https://openreview.net/forum?id=ryQu7f-RZ 66 | """ 67 | 68 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): 69 | if not 0.0 <= lr: 70 | raise ValueError("Invalid learning rate: {}".format(lr)) 71 | if not 0.0 <= eps: 72 | raise ValueError("Invalid epsilon value: {}".format(eps)) 73 | if not 0.0 <= betas[0] < 1.0: 74 | raise ValueError( 75 | "Invalid beta parameter at index 0: {}".format(betas[0])) 76 | if not 0.0 <= betas[1] < 1.0: 77 | raise ValueError( 78 | "Invalid beta parameter at index 1: {}".format(betas[1])) 79 | 80 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 81 | 82 | super().__init__(params, defaults) 83 | 84 | # save version 85 | self.version = version 86 | 87 | def __setstate__(self, state): 88 | super().__setstate__(state) 89 | 90 | def step(self, closure=None): 91 | """Performs a single optimization step. 92 | Arguments: 93 | closure (callable, optional): A closure that reevaluates the model 94 | and returns the loss. 95 | """ 96 | loss = None 97 | if closure is not None: 98 | loss = closure() 99 | 100 | for group in self.param_groups: 101 | for p in group['params']: 102 | if p.grad is None: 103 | continue 104 | grad = p.grad.data 105 | if grad.is_sparse: 106 | raise RuntimeError( 107 | 'diffGrad does not support sparse gradients, please consider SparseAdam instead') 108 | 109 | state = self.state[p] 110 | 111 | # State initialization 112 | if len(state) == 0: 113 | state['step'] = 0 114 | # Exponential moving average of gradient values 115 | state['exp_avg'] = torch.zeros_like(p.data) 116 | # Exponential moving average of squared gradient values 117 | state['exp_avg_sq'] = torch.zeros_like(p.data) 118 | # Previous gradient 119 | state['previous_grad'] = torch.zeros_like(p.data) 120 | 121 | exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] 122 | beta1, beta2 = group['betas'] 123 | 124 | state['step'] += 1 125 | 126 | if group['weight_decay'] != 0: 127 | grad.add_(group['weight_decay'], p.data) 128 | 129 | # Decay the first and second moment running average coefficient 130 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 131 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 132 | denom = exp_avg_sq.sqrt().add_(group['eps']) 133 | 134 | bias_correction1 = 1 - beta1 ** state['step'] 135 | bias_correction2 = 1 - beta2 ** state['step'] 136 | 137 | # compute diffgrad coefficient (dfc) 138 | 139 | if self.version == 0: 140 | diff = abs(previous_grad - grad) 141 | elif self.version == 1: 142 | diff = previous_grad-grad 143 | elif self.version == 2: 144 | diff = .5*abs(previous_grad - grad) 145 | 146 | if self.version == 0 or self.version == 1: 147 | dfc = 1. / (1. + torch.exp(-diff)) 148 | elif self.version == 2: 149 | # DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 150 | dfc = 9. / (1. 
+ torch.exp(-diff))-4 151 | 152 | state['previous_grad'] = grad 153 | 154 | # update momentum with dfc 155 | exp_avg1 = exp_avg * dfc 156 | 157 | step_size = group['lr'] * \ 158 | math.sqrt(bias_correction2) / bias_correction1 159 | 160 | p.data.addcdiv_(-step_size, exp_avg1, denom) 161 | 162 | return loss 163 | 164 | 165 | class LookaheadDiffGrad(torch.optim.Optimizer): 166 | def __init__(self, 167 | params, 168 | lr=1e-3, 169 | betas=(0.9, 0.999), 170 | eps=1e-8, 171 | weight_decay=0, 172 | alpha=0.5, 173 | k=6): 174 | 175 | if not 0.0 <= alpha <= 1.0: 176 | raise ValueError(f'Invalid slow update rate: {alpha}') 177 | if not 1 <= k: 178 | raise ValueError(f'Invalid lookahead steps: {k}') 179 | 180 | base_optimizer = DiffGrad( 181 | params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 182 | self.buffer = [[None, None, None] for ind in range(10)] 183 | self.optimizer = base_optimizer 184 | self.param_groups = self.optimizer.param_groups 185 | self.alpha = alpha 186 | self.k = k 187 | for group in self.param_groups: 188 | group["step_counter"] = 0 189 | self.slow_weights = [[p.clone().detach() for p in group['params']] 190 | for group in self.param_groups] 191 | 192 | for w in it.chain(*self.slow_weights): 193 | w.requires_grad = False 194 | 195 | self.state = base_optimizer.state 196 | 197 | def step(self, closure=None): 198 | loss = None 199 | if closure is not None: 200 | loss = closure() 201 | loss = self.optimizer.step() 202 | for group, slow_weights in zip(self.param_groups, self.slow_weights): 203 | group['step_counter'] += 1 204 | if group['step_counter'] % self.k != 0: 205 | continue 206 | for p, q in zip(group['params'], slow_weights): 207 | if p.grad is None: 208 | continue 209 | q.data.add_(self.alpha, p.data - q.data) 210 | p.data.copy_(q.data) 211 | self.state = self.optimizer.state 212 | return loss 213 | 214 | 215 | class RAdam(torch.optim.Optimizer): 216 | # from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py 217 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 218 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 219 | self.buffer = [[None, None, None] for ind in range(10)] 220 | super(RAdam, self).__init__(params, defaults) 221 | 222 | def __setstate__(self, state): 223 | super(RAdam, self).__setstate__(state) 224 | 225 | def step(self, closure=None): 226 | 227 | loss = None 228 | if closure is not None: 229 | loss = closure() 230 | 231 | for group in self.param_groups: 232 | 233 | for p in group['params']: 234 | if p.grad is None: 235 | continue 236 | grad = p.grad.data.float() 237 | if grad.is_sparse: 238 | raise RuntimeError( 239 | 'RAdam does not support sparse gradients') 240 | 241 | p_data_fp32 = p.data.float() 242 | 243 | state = self.state[p] 244 | 245 | if len(state) == 0: 246 | state['step'] = 0 247 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 248 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 249 | else: 250 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 251 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as( 252 | p_data_fp32) 253 | 254 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 255 | beta1, beta2 = group['betas'] 256 | 257 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 258 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 259 | 260 | state['step'] += 1 261 | buffered = self.buffer[int(state['step'] % 10)] 262 | if state['step'] == buffered[0]: 263 | N_sma, step_size = buffered[1], buffered[2] 264 | else: 265 | buffered[0] = 
state['step']
266 |                     beta2_t = beta2 ** state['step']
267 |                     N_sma_max = 2 / (1 - beta2) - 1
268 |                     N_sma = N_sma_max - 2 * \
269 |                         state['step'] * beta2_t / (1 - beta2_t)
270 |                     buffered[1] = N_sma
271 | 
272 |                     # more conservative since it's an approximated value
273 |                     if N_sma >= 5:
274 |                         step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
275 |                             N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
276 |                     else:
277 |                         step_size = 1.0 / (1 - beta1 ** state['step'])
278 |                     buffered[2] = step_size
279 | 
280 |                 if group['weight_decay'] != 0:
281 |                     p_data_fp32.add_(-group['weight_decay']
282 |                                      * group['lr'], p_data_fp32)
283 | 
284 |                 # more conservative since it's an approximated value
285 |                 if N_sma >= 5:
286 |                     denom = exp_avg_sq.sqrt().add_(group['eps'])
287 |                     p_data_fp32.addcdiv_(-step_size *
288 |                                          group['lr'], exp_avg, denom)
289 |                 else:
290 |                     p_data_fp32.add_(-step_size * group['lr'], exp_avg)
291 | 
292 |                 p.data.copy_(p_data_fp32)
293 | 
294 |         return loss
295 | 
296 | 
297 | class LookaheadRAdam(torch.optim.Optimizer):
298 |     def __init__(self,
299 |                  params,
300 |                  lr=1e-3,
301 |                  betas=(0.9, 0.999),
302 |                  eps=1e-8,
303 |                  weight_decay=0,
304 |                  alpha=0.5,
305 |                  k=6):
306 | 
307 |         if not 0.0 <= alpha <= 1.0:
308 |             raise ValueError(f'Invalid slow update rate: {alpha}')
309 |         if not 1 <= k:
310 |             raise ValueError(f'Invalid lookahead steps: {k}')
311 | 
312 |         base_optimizer = RAdam(params, lr=lr, betas=betas,
313 |                                eps=eps, weight_decay=weight_decay)
314 |         self.buffer = [[None, None, None] for ind in range(10)]
315 |         self.optimizer = base_optimizer
316 |         self.param_groups = self.optimizer.param_groups
317 |         self.alpha = alpha
318 |         self.k = k
319 |         for group in self.param_groups:
320 |             group["step_counter"] = 0
321 |         self.slow_weights = [[p.clone().detach() for p in group['params']]
322 |                              for group in self.param_groups]
323 | 
324 |         for w in it.chain(*self.slow_weights):
325 |             w.requires_grad = False
326 | 
327 |         self.state = base_optimizer.state
328 | 
329 |     def step(self, closure=None):
330 |         loss = None
331 |         if closure is not None:
332 |             loss = closure()
333 |         loss = self.optimizer.step()
334 |         for group, slow_weights in zip(self.param_groups, self.slow_weights):
335 |             group['step_counter'] += 1
336 |             if group['step_counter'] % self.k != 0:
337 |                 continue
338 |             for p, q in zip(group['params'], slow_weights):
339 |                 if p.grad is None:
340 |                     continue
341 |                 q.data.add_(self.alpha, p.data - q.data)
342 |                 p.data.copy_(q.data)
343 |         self.state = self.optimizer.state
344 | 
--------------------------------------------------------------------------------
/aimlutils/echo/README.md:
--------------------------------------------------------------------------------
1 | # **E**arth **C**omputing **H**yperparameter **O**ptimization (ECHO): A distributed hyperparameter optimization package built with Optuna
2 | 
3 | ### Usage
4 | 
5 | Run the hyperparameter optimization script:
6 | ```bash
7 | python optimize.py hyperparameters.yml model_config.yml
8 | ```
9 | Run the report script to get a dataframe of the results saved in the study:
10 | ```bash
11 | python report.py hyperparameters.yml [-p plot_config.yml]
12 | ```
13 | ### Dependencies
14 | 
15 | There are three files that must be supplied to use the optimize script:
16 | 
17 | * A custom objective class that trains your model and returns the metric to be optimized.
18 | 
19 | * A configuration file specifying the hyperparameter optimization settings.
20 | 
21 | * A model configuration file that contains the information needed to train your model (see examples in the holodec and gecko projects).
22 | 
23 | ### Custom objective class
24 | 
25 | The custom **Objective** class (objective.py) must inherit a **BaseObjective** class (which lives in base_objective.py), and must contain a method named **train** that returns the value of the optimization metric (in a dictionary, see below). There are example objective scripts for both torch and Keras in the examples directory. Your custom Objective class will inherit all of the methods and attributes from the BaseObjective. The Objective's train does not depend on the machine learning library used! For example, a simple template has the following structure:
26 | 
27 | ```python
28 | from aimlutils.echo.src.base_objective import *
29 | from aimlutils.echo.src.pruners import KerasPruningCallback
30 | 
31 | class Objective(BaseObjective):
32 | 
33 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
34 | 
35 |         # Initialize the base class
36 |         BaseObjective.__init__(self, config, metric, device)
37 | 
38 |     def train(self, trial, conf):
39 | 
40 |         # Make any custom edits to the model conf before using it to train a model.
41 |         conf = custom_updates(trial, conf)
42 | 
43 |         ... (load data sets, build model, etc)
44 | 
45 |         callbacks = [KerasPruningCallback(trial, self.metric, interval = 1)]
46 |         result = Model.fit(..., callbacks = callbacks)
47 | 
48 |         results_dictionary = {
49 |             "val_loss": result["val_loss"],
50 |             "loss": result["loss"],
51 |             ...
52 |             "val_accuracy": result["val_accuracy"]
53 |         }
54 |         return results_dictionary
55 | ```
56 | You can have as many inputs to your custom Objective as needed, as long as those that are required to initialize the base class are included. The Objective class will call the train method from the inherited dunder **__call__** method, and will finish up by calling the inherited save method that writes the metric(s) details to disk. Note that, because one class inherits from the other, you do not have to supply these two methods, as they are pre-coded in the base class. You can customize them at your leisure using overriding methods in your custom Objective. Check out the scripts base_objective.py and run.py to see how things are structured and called.
57 | 
58 | As noted, the metric used to evaluate the model's training performance must be in the results dictionary. Other metrics that the user may want to track will be saved to disk if they are included in the results dictionary (the keys of the dictionary are used to name the columns in a pandas dataframe). See the example above where several metrics are being returned.
59 | 
60 | Note that the first line in the train method states that any custom changes to the model configuration (conf) must be done here. If custom changes are required, the user may supply a method named **custom_updates** in addition to the Objective class (you may save both in the same script, or import the method from somewhere else in your custom Objective script). See also the section **Custom model configuration updates** below for an example.
61 | 
62 | Finally, if using Keras, you need to include the (customized) KerasPruningCallback that will allow optuna to terminate unpromising trials. We do something similar when using torch; see the examples directory.
63 | 
64 | ### Hyperparameter optimizer configuration
65 | 
66 | There are several fields (log, slurm, pbs, and optuna), each with variable subfields.
The log field allows us to save a file for printing messages and warnings that are placed in areas throughout the package. The slurm/pbs fields allow the user to specify how many GPU nodes should be used, and support any slurm or pbs setting. The optuna field allows the user to configure the optimization procedure, including specifying which parameters will be used, as well as the performance metric. For example, consider the configuration settings:
67 | 
68 | ```yaml
69 | pbs:
70 |   jobs: 10
71 |   kernel: "ncar_pylib /glade/work/schreck/py37"
72 |   bash: ["module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1"]
73 |   batch:
74 |     l: ["select=1:ncpus=8:ngpus=1:mem=128GB", "walltime=12:00:00"]
75 |     A: "NAML0001"
76 |     q: "casper"
77 |     N: "echo_trial"
78 |     o: "echo_trial.out"
79 |     e: "echo_trial.err"
80 | slurm:
81 |   jobs: 15
82 |   kernel: "ncar_pylib /glade/work/schreck/py37"
83 |   bash: ["module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1"]
84 |   batch:
85 |     account: "NAML0001"
86 |     gres: "gpu:v100:1"
87 |     mem: "128G"
88 |     n: 8
89 |     t: "12:00:00"
90 |     J: "echo_trial"
91 |     o: "echo_trial.out"
92 |     e: "echo_trial.err"
93 | optuna:
94 |   study_name: "holodec_optimization"
95 |   storage: "sqlite:///path/to/data/storage.db"
96 |   reload: 0
97 |   objective: "examples/torch/objective.py"
98 |   metric: "val_loss"
99 |   direction: "minimize"
100 |   n_trials: 500
101 |   gpu: True
102 |   save_path: 'test'
103 |   sampler:
104 |     type: "TPESampler"
105 |     n_startup_trials: 30
106 |   parameters:
107 |     num_dense:
108 |       type: "int"
109 |       settings:
110 |         name: "num_dense"
111 |         low: 0
112 |         high: 10
113 |     dropout:
114 |       type: "float"
115 |       settings:
116 |         name: "dr"
117 |         low: 0.0
118 |         high: 0.5
119 |     optimizer:learning_rate:
120 |       type: "loguniform"
121 |       settings:
122 |         name: "lr"
123 |         low: 0.0000001
124 |         high: 0.01
125 |     model:activation:
126 |       type: "categorical"
127 |       settings:
128 |         name: "activation"
129 |         choices: ["relu", "linear", "leaky", "elu", "prelu"]
130 | log [optional]:
131 |   save_path: "path/to/data/log.txt"
132 | ```
133 | 
134 | The subfields within "pbs" and "slurm" should mostly be familiar to you. In this example there would be 10 jobs submitted to the pbs queue and 15 jobs to the slurm queue. The kernel field is optional and can be any call(s) to activate a conda/python/ncar_pylib/etc environment. Additional snippets that you might need in your launch script can be added to the list in the "bash" field; for example, as shown above, modules may need to be loaded before training a model. Note that the bash options will be run in order, and before the kernel field. Remove or leave the kernel field blank if you do not need it.
135 | 
136 | The subfields within the "optuna" field have the following functionality:
137 | 
138 | * study_name: The name of the study.
139 | * storage: sqlite or mysql destination.
140 | * reload: Whether to continue using a previous study (True) or to initialize a new study (False). If the initial number of workers does not reach the number of trials and you wish to resubmit, set this to True.
141 | * objective: The path to the user-supplied objective class (it must be named objective.py).
142 | * metric: The metric to be used to determine the model performance.
143 | * direction: Indicates which direction the metric must go to represent improvement (pick from maximize or minimize).
144 | * n_trials: The number of trials in the study.
145 | * gpu: Use the gpu or cpu.
146 | * save_path: Directory path where data will be saved.
147 | 
148 | * sampler
149 |   + type: Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space).
150 | * parameters
151 |   + type: Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform".
152 |   + settings: This dictionary field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform).
153 | 
154 | Lastly, the "log" field allows you to save the logging details to file; they will always be printed to stdout. If this field is removed, logging details will only be printed to stdout.
155 | 
156 | ### Model configuration
157 | 
158 | The model configuration file can be what you have been using up to this point to train your model; in other words, no changes are necessary. This package will take the suggested hyperparameters from an optuna trial and make changes to the model configuration on the fly. This can either be done automatically by this package, or the user may supply an additional method for making custom changes. For example, consider the (truncated) configuration for training a model to predict hologram properties with a holodec dataset:
159 | 
160 | ```yaml
161 | model:
162 |   image_channels: 1
163 |   hidden_dims: [3, 94, 141, 471, 425, 1122]
164 |   z_dim: 1277
165 |   dense_hidden_dims: [1000]
166 |   dense_dropouts: [0.0]
167 |   tasks: ["x", "y", "z", "d", "binary"]
168 |   activation: "relu"
169 | optimizer:
170 |   type: "lookahead-diffgrad"
171 |   learning_rate: 0.000631
172 |   weight_decay: 0.0
173 | trainer:
174 |   start_epoch: 0
175 |   epochs: 1
176 |   clip: 1.0
177 |   alpha: 1.0
178 |   beta: 0.1
179 |   path_save: "test"
180 | ```
181 | 
182 | The model configuration will be automatically updated if and only if the name of the parameter specified in the hyperparameter configuration, optuna.parameters, can be used as a nested lookup key in the model configuration file. For example, observe in the hyperparameter configuration file above that the named parameter **optimizer:learning_rate** contains a colon, which is used downstream to split the named parameter into multiple keys that allow us, starting from the top of the nested tree in the model configuration file, to work our way down until the relevant field is located and the trial-suggested value is substituted in. In this example, the split keys are ["optimizer", "learning_rate"].
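183 | 
184 | To make this substitution concrete, below is a minimal sketch of that kind of nested update. It is for illustration only: the actual logic lives in run.py, and update_config is a hypothetical name rather than part of the package API.
185 | 
186 | ```python
187 | def update_config(conf, named_parameter, value):
188 |     # Split e.g. "optimizer:learning_rate" into ["optimizer", "learning_rate"],
189 |     # walk down the nested model configuration, and substitute the
190 |     # trial-suggested value at the final key.
191 |     keys = named_parameter.split(":")
192 |     node = conf
193 |     for key in keys[:-1]:
194 |         node = node[key]
195 |     node[keys[-1]] = value
196 | 
197 | conf = {"optimizer": {"type": "lookahead-diffgrad", "learning_rate": 0.000631}}
198 | update_config(conf, "optimizer:learning_rate", 0.001)  # conf["optimizer"]["learning_rate"] is now 0.001
199 | ```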
200 | 
201 | This scheme will work in general as long as the named parameter in optuna.parameters uses : as the separator, and, once split, the resulting list can be used to locate the relevant field in the model configuration.
202 | 
203 | Note that optuna has a limited range of trial parameter types, all but one of which are numerical in one form or another. If you wanted to optimize the activation layer(s) in your neural network as in the example above, you could do so by utilizing the "categorical" trial suggestion. For example, the following list of activation layer names could be specified: ["relu", "linear", "leaky", "elu", "prelu"].
204 | 
205 | 
206 | ### Custom model configuration updates
207 | 
208 | You may additionally supply rules for updating the model configuration file by including a method named **custom_updates**, which will make the desired changes to the configuration file using optuna trial parameter guesses.
209 | 
210 | In the example configurations described above, the hyperparameter configuration contained an optuna.parameters field "num_dense," but this field is not present in the model configuration. There is however a "dense_hidden_dims" field in the model configuration that contains a list of the layer sizes in the model, where the number of layers is the length of the list. In our example, just one layer is specified, but we want to vary that number.
211 | 
212 | To use the "num_dense" hyperparameter from the hyperparameter configuration file, we can create the following method:
213 | 
214 | ```python
215 | def custom_updates(trial, conf):
216 | 
217 |     # Get list of hyperparameters from the config
218 |     hyperparameters = conf["optuna"]["parameters"]
219 | 
220 |     # Now update some via custom rules (unpack the "settings" subfield, matching the "int" type above)
221 |     num_dense = trial.suggest_int(**hyperparameters["num_dense"]["settings"])
222 | 
223 |     # Update the config based on optuna's suggestion
224 |     conf["model"]["dense_hidden_dims"] = [1000 for k in range(num_dense)]
225 | 
226 |     return conf
227 | ```
228 | 
229 | The method should be called first thing in the custom Objective.train method (see the example Objective above). You may have noticed that the configuration (named conf) contains both hyperparameter and model fields. This package will copy the hyperparameter optuna field to the model configuration for convenience, so that we can reduce the total number of class and method dependencies (which helps me keep the code generalized). This occurs in the run.py script.
230 | 
231 | ### Custom plot settings for report.py
232 | 
233 | The script report.py will load the current study, identify the best trial in the study, and will compute the relative importance of each parameter using both fanova and MDI (see [here](https://optuna.readthedocs.io/en/v1.3.0/reference/importance.html) for details).
234 | 
235 | Additionally, the script will create two figures, an optimization history plot and an intermediate values plot. If your objective returns two metrics to be optimized, only the pareto front plot will be generated. See the [documentation](https://optuna.readthedocs.io/en/v1.3.0/reference/visualization.html) for details on the plots.
236 | 
237 | Note that ECHO only supports the [matplotlib](https://optuna.readthedocs.io/en/latest/reference/visualization/matplotlib.html) generated plots from Optuna, for now. Optuna's default is to use plot.ly; however, not all LTS Jupyter-lab environments support that backend.
238 | 
239 | The user may customize the plots to a degree by additionally supplying a plot configuration yaml file (named plot_config.yml above, and called as an optional argument using the parser flag -p or --plot). Currently, the user may only adjust the rcParams backend variables (see [here](https://matplotlib.org/3.3.3/tutorials/introductory/customizing.html) for a comprehensive list) plus a limited set of other variables (see below):
240 | 
241 | ```yaml
242 | optimization_history:
243 |   save_path: '/glade/work/schreck/repos/holodec-ml/scripts/schreck/decoder/results/opt_multi_particle'
244 |   set_xlim: [0, 100]
245 |   set_ylim: [3e4, 1e6]
246 |   set_xscale: "log"
247 |   set_yscale: "log"
248 |   rcparams:
249 |     'backend': 'ps'
250 |     'lines.markersize' : 4
251 |     'axes.labelsize': 10
252 |     'legend.fontsize': 10
253 |     'xtick.labelsize': 10
254 |     'ytick.labelsize': 10
255 |     'xtick.top': True
256 |     'xtick.bottom': True
257 |     'ytick.right': True
258 |     'ytick.left': True
259 |     'xtick.direction': 'in'
260 |     'ytick.direction': 'in'
261 |     'font.serif' : 'Helvetica'
262 |     'figure.dpi' : 600
263 |     'figure.autolayout': True
264 |     'legend.numpoints' : 1
265 |     'legend.handlelength' : 1.0
266 |     'legend.columnspacing' : 1.0
267 | ```
268 | 
269 | For the other supported plots, simply add or change "optimization_history" to "intermediate_values", or if optimizing more than one metric, "pareto_front".
270 | 
--------------------------------------------------------------------------------
/aimlutils/echo/optimize.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import os
5 | import sys
6 | import yaml
7 | import optuna
8 | import logging
9 | import subprocess
10 | from argparse import ArgumentParser
11 | from aimlutils.echo.src.samplers import samplers
12 | from typing import Dict
13 | 
14 | 
15 | def args():
16 |     parser = ArgumentParser(description=
17 |         "ECHO: A distributed multi-gpu hyperparameter optimization package built with Optuna"
18 |     )
19 | 
20 |     parser.add_argument("hyperparameter", type=str, help=
21 |         "Path to the hyperparameter configuration containing your inputs."
22 |     )
23 | 
24 |     parser.add_argument("model", type=str, help=
25 |         "Path to the model configuration containing your inputs."
26 |     )
27 |     parser.add_argument(
28 |         "-n",
29 |         "--study_name",
30 |         dest="study_name",
31 |         type=str,
32 |         default=False,
33 |         help="The name of the study"
34 |     )
35 |     parser.add_argument(
36 |         "--override",
37 |         dest="override",
38 |         type=bool,
39 |         default=False,
40 |         help="Force remove the study name from the storage"
41 |     )
42 |     parser.add_argument(
43 |         "-r",
44 |         "--reload",
45 |         dest="reload",
46 |         type=str,
47 |         default=False,
48 |         help="Set = 0 to initiate a new study, = 1 to continue a study"
49 |     )
50 |     parser.add_argument(
51 |         "-o",
52 |         "--objective",
53 |         dest="objective",
54 |         type=str,
55 |         default=False,
56 |         help="Path to the supplied objective class"
57 |     )
58 |     parser.add_argument(
59 |         "-d",
60 |         "--direction",
61 |         dest="direction",
62 |         type=str,
63 |         default=False,
64 |         help="Direction of the metric. 
Choose from maximize or minimize"
65 |     )
66 |     parser.add_argument(
67 |         "-m",
68 |         "--metric",
69 |         dest="metric",
70 |         type=str,
71 |         default=False,
72 |         help="The validation metric"
73 |     )
74 |     parser.add_argument(
75 |         "-t",
76 |         "--trials",
77 |         dest="n_trials",
78 |         type=str,
79 |         default=False,
80 |         help="The number of trials in the study"
81 |     )
82 |     parser.add_argument(
83 |         "-g",
84 |         "--gpu",
85 |         dest="gpu",
86 |         type=str,
87 |         default=False,
88 |         help="Use the gpu or not (bool)"
89 |     )
90 |     parser.add_argument(
91 |         "-s",
92 |         "--save_path",
93 |         dest="save_path",
94 |         type=str,
95 |         default=False,
96 |         help="Path to the save directory"
97 |     )
98 |     parser.add_argument(
99 |         "-c",
100 |         "--create_study",
101 |         dest="create_study",
102 |         type=str,
103 |         default=False,
104 |         help="Create a study but do not submit any workers"
105 |     )
106 |     return vars(parser.parse_args())
107 | 
108 | 
109 | def fix_broken_study(_study: optuna.study.Study,
110 |                      name: str,
111 |                      storage: str,
112 |                      direction: str,
113 |                      sampler: optuna.samplers.BaseSampler):
114 | 
115 |     """
116 |     This method removes broken trials, which are those
117 |     that failed to complete 1 epoch before slurm (or something else) killed the job
118 |     and returned NAN or NONE.
119 | 
120 |     Failure to remove these trials leads to an error when optuna tries to update the
121 |     parameters. This is because these trials only have "NoneType" data associated
122 |     with them, but we need numerical data (e.g. the loss value) to update parameters.
123 |     """
124 | 
125 |     if len(_study.trials) == 0:
126 |         return _study, []
127 | 
128 |     trials = []
129 |     removed = []
130 |     for trial in _study.trials:
131 |         if len(trial.intermediate_values) == 0:
132 |             trials.append(trial)
133 |             continue
134 |         step, intermediate_value = max(trial.intermediate_values.items())
135 |         if intermediate_value is not None:
136 |             trials.append(trial)
137 |         else:
138 |             removed.append(trial.number+1)
139 | 
140 |     if len(removed) == 0:
141 |         return _study, []
142 | 
143 |     # Delete the current study
144 |     optuna.delete_study(study_name=name, storage=storage)
145 | 
146 |     # Create a new one in its place
147 |     if isinstance(direction, str):
148 |         study_fixed = optuna.create_study(study_name=name,
149 |                                           storage=storage,
150 |                                           direction=direction,
151 |                                           sampler=sampler,
152 |                                           load_if_exists=False)
153 |     else:
154 |         study_fixed = optuna.multi_objective.create_study(
155 |             study_name=name,
156 |             storage=storage,
157 |             directions=direction,
158 |             sampler=sampler,
159 |             load_if_exists=False
160 |         )
161 | 
162 |     # Add the working trials to the new study
163 |     for trial in trials:
164 |         study_fixed.add_trial(trial)
165 | 
166 |     return study_fixed, removed
167 | 
168 | 
169 | def prepare_slurm_launch_script(hyper_config: str,
170 |                                 model_config: str):
171 | 
172 |     slurm_options = ["#!/bin/bash -l"]
173 |     slurm_options += [
174 |         f"#SBATCH -{arg} {val}" if len(arg) == 1 else f"#SBATCH --{arg}={val}"
175 |         for arg, val in hyper_config["slurm"]["batch"].items()
176 |     ]
177 |     if "bash" in hyper_config["slurm"]:
178 |         if len(hyper_config["slurm"]["bash"]) > 0:
179 |             for line in hyper_config["slurm"]["bash"]:
180 |                 slurm_options.append(line)
181 |     if "kernel" in hyper_config["slurm"]:
182 |         if hyper_config["slurm"]["kernel"] is not None:
183 |             slurm_options.append(f'{hyper_config["slurm"]["kernel"]}')
184 |     import aimlutils.echo as opt
185 |     aiml_path = os.path.join(
186 |         os.path.abspath(opt.__file__).replace("__init__.py", ""),
187 |         "run.py"
188 |     )
189 |     slurm_options.append(f"python {aiml_path} 
{sys.argv[1]} {sys.argv[2]}")
190 |     return slurm_options
191 | 
192 | 
193 | def prepare_pbs_launch_script(hyper_config: str,
194 |                               model_config: str):
195 | 
196 |     pbs_options = ["#!/bin/bash -l"]
197 |     for arg, val in hyper_config["pbs"]["batch"].items():
198 |         if arg == "l" and type(val) == list:
199 |             for opt in val:
200 |                 pbs_options.append(f"#PBS -{arg} {opt}")
201 |         elif len(arg) == 1:
202 |             pbs_options.append(f"#PBS -{arg} {val}")
203 |         else:
204 |             pbs_options.append(f"#PBS --{arg}={val}")
205 |     if "bash" in hyper_config["pbs"]:
206 |         if len(hyper_config["pbs"]["bash"]) > 0:
207 |             for line in hyper_config["pbs"]["bash"]:
208 |                 pbs_options.append(line)
209 |     if "kernel" in hyper_config["pbs"]:
210 |         if hyper_config["pbs"]["kernel"] is not None:
211 |             pbs_options.append(f'{hyper_config["pbs"]["kernel"]}')
212 |     import aimlutils.echo as opt
213 |     aiml_path = os.path.join(
214 |         os.path.abspath(opt.__file__).replace("__init__.py", ""),
215 |         "run.py"
216 |     )
217 |     pbs_options.append(f"python {aiml_path} {sys.argv[1]} {sys.argv[2]}")
218 |     return pbs_options
219 | 
220 | 
221 | def configuration_report(_dict: Dict[str, str],
222 |                          path: list = None):
223 | 
224 |     if path is None:
225 |         path = []
226 |     for k,v in _dict.items():
227 |         newpath = path + [k]
228 |         if isinstance(v, dict):
229 |             for u in configuration_report(v, newpath):
230 |                 yield u
231 |         else:
232 |             yield newpath, v
233 | 
234 | 
235 | if __name__ == "__main__":
236 | 
237 |     args_dict = args()
238 | 
239 |     hyper_config = args_dict.pop("hyperparameter")
240 |     model_config = args_dict.pop("model")
241 | 
242 |     if not hyper_config or not model_config:
243 |         raise OSError(
244 |             "Usage: python optimize.py hyperparameter.yml model.yml [optional parser options]"
245 |         )
246 | 
247 |     if os.path.isfile(hyper_config):
248 |         with open(hyper_config) as f:
249 |             hyper_config = yaml.load(f, Loader=yaml.FullLoader)
250 |     else:
251 |         raise OSError(
252 |             f"Hyperparameter optimization config file {sys.argv[1]} does not exist"
253 |         )
254 | 
255 |     if os.path.isfile(model_config):
256 |         with open(model_config) as f:
257 |             model_config = yaml.load(f, Loader=yaml.FullLoader)
258 |     else:
259 |         raise OSError(
260 |             f"Model config file {sys.argv[2]} does not exist"
261 |         )
262 | 
263 |     # Set up a logger
264 |     root = logging.getLogger()
265 |     root.setLevel(logging.DEBUG)
266 |     formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
267 | 
268 |     # Stream output to stdout
269 |     ch = logging.StreamHandler()
270 |     ch.setLevel(logging.INFO)
271 |     ch.setFormatter(formatter)
272 |     root.addHandler(ch)
273 | 
274 |     # Stream output to file
275 |     if "log" in hyper_config:
276 |         savepath = hyper_config["log"]["save_path"] if "save_path" in hyper_config["log"] else "log.txt"
277 |         mode = "a+" if bool(hyper_config["optuna"]["reload"]) else "w"
278 |         fh = logging.FileHandler(savepath,
279 |                                  mode=mode,
280 |                                  encoding='utf-8')
281 |         fh.setLevel(logging.DEBUG)
282 |         fh.setFormatter(formatter)
283 |         root.addHandler(fh)
284 | 
285 |     # Override other options in hyperparameter config file, if supplied. 
286 |     for name, val in args_dict.items():
287 |         if val and (name in hyper_config["optuna"]):
288 |             current_value = hyper_config["optuna"][name]
289 |             logging.info(
290 |                 f"Overriding {name} in the hyperparameter configuration: {current_value} -> {val}"
291 |             )
292 |             hyper_config["optuna"][name] = val
293 | 
294 |     # Print the configurations to the logger
295 |     logging.info("Current hyperparameter configuration settings:")
296 |     for p, v in configuration_report(hyper_config):
297 |         full_path = ".".join([str(_p) for _p in p])
298 |         logging.info(f"{full_path}: {v}")
299 |     logging.info("Current model configuration settings:")
300 |     for p, v in configuration_report(model_config):
301 |         full_path = ".".join([str(_p) for _p in p])
302 |         logging.info(f"{full_path}: {v}")
303 | 
304 |     # Set up new db entry if reload = 0
305 |     reload_study = bool(hyper_config["optuna"]["reload"])
306 | 
307 |     # Check if save directory exists
308 |     if not os.path.isdir(hyper_config["optuna"]["save_path"]):
309 |         raise OSError(
310 |             f'Create the save directory {hyper_config["optuna"]["save_path"]} and try again'
311 |         )
312 | 
313 |     study_name = hyper_config["optuna"]["study_name"]
314 |     #path_to_study = os.path.join(hyper_config["optuna"]["save_path"], name)
315 |     #storage = f"sqlite:///{path_to_study}"
316 |     storage = hyper_config["optuna"]["storage"]
317 |     direction = hyper_config["optuna"]["direction"]
318 |     single_objective = isinstance(direction, str)
319 | 
320 |     # Initialize the sampler
321 |     if "sampler" not in hyper_config["optuna"]:
322 |         if single_objective:  # single-objective
323 |             sampler = optuna.samplers.TPESampler()
324 |         else:  # multi-objective equivalent of TPESampler
325 |             sampler = optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler()
326 |     else:
327 |         sampler = samplers(hyper_config["optuna"]["sampler"])
328 | 
329 |     # Initiate a study for the first time
330 |     if not reload_study:
331 | 
332 |         # Check the direction
333 |         if isinstance(direction, list):
334 |             for direc in direction:
335 |                 if direc not in ["maximize", "minimize"]:
336 |                     raise OSError(
337 |                         f"Optimizer direction {direc} not recognized. Choose from maximize or minimize"
338 |                     )
339 | 
340 |         else:
341 |             if direction not in ["maximize", "minimize"]:
342 |                 raise OSError(
343 |                     f"Optimizer direction {direction} not recognized. Choose from maximize or minimize"
344 |                 )
345 | 
346 |         # Check if the study record already exists.
347 |         try:
348 |             optuna.load_study(
349 |                 study_name = study_name,
350 |                 storage = storage,
351 |                 #direction = direction,
352 |                 sampler = sampler
353 |             )
354 |         except KeyError:  # The study name was not in storage, can proceed
355 |             pass
356 | 
357 |         except:
358 |             if args_dict["override"]:
359 |                 message = f"Removing the study_name {study_name} that exists in storage {storage}."
360 |                 logging.warning(message)
361 |                 # delete_study only needs the name and storage to drop the record
362 |                 optuna.delete_study(
363 |                     study_name = study_name,
364 |                     storage = storage
365 |                 )
366 |             else:
367 |                 message = f"The study {study_name} already exists in storage and reload was False."
368 | message += f" Delete it from {storage}, and try again or rerun this script" 369 | message += f" with the flag: --override 1" 370 | raise OSError(message) 371 | 372 | # Create a new study in the storage object 373 | if single_objective: 374 | create_study = optuna.create_study( 375 | study_name = study_name, 376 | storage = storage, 377 | direction = direction, 378 | sampler = sampler 379 | ) 380 | else: 381 | create_study = optuna.multi_objective.study.create_study( 382 | study_name = study_name, 383 | storage = storage, 384 | directions = direction, 385 | sampler = sampler 386 | ) 387 | 388 | # Check to see if there are any broken trials 389 | else: 390 | #if not os.path.isfile(path_to_study): 391 | # raise OSError("Reload was true but the study does not yet exist. Set reload = 0 and try again.") 392 | 393 | logging.info( 394 | f"Checking the study for broken trials (those that did not complete 1 epoch before dying)" 395 | ) 396 | if single_objective: 397 | study = optuna.load_study( 398 | study_name = study_name, 399 | storage = storage, 400 | sampler = sampler 401 | ) 402 | else: 403 | study = optuna.multi_objective.study.load_study( 404 | study_name = study_name, 405 | storage = storage, 406 | sampler = sampler 407 | ) 408 | study, removed = fix_broken_study(study, study_name, storage, direction, sampler) 409 | 410 | if len(removed): 411 | logging.info( 412 | f"Removing problematic trials {removed}." 413 | ) 414 | else: 415 | logging.info("All trials check out!") 416 | 417 | 418 | # Override to create the database but skip submitting jobs. 419 | create_db_only = True if args_dict["create_study"] else False 420 | 421 | # Stop here if arg is defined -- intention is that you manually run run.py for debugging purposes 422 | if create_db_only: 423 | logging.info(f"Created study {study_name} located at {storage}. 
Exiting.") 424 | sys.exit() 425 | 426 | ############### 427 | # 428 | # SLURM SUPPORT 429 | # 430 | ############### 431 | 432 | # Prepare launch script 433 | if "slurm" in hyper_config: 434 | launch_script = prepare_slurm_launch_script(hyper_config, model_config) 435 | 436 | # Save the configured script 437 | script_path = hyper_config["optuna"]["save_path"] 438 | script_location = os.path.join(script_path, "launch_slurm.sh") 439 | with open(script_location, "w") as fid: 440 | for line in launch_script: 441 | fid.write(f"{line}\n") 442 | 443 | # Launch the slurm jobs 444 | job_ids = [] 445 | name_condition = "J" in hyper_config["slurm"]["batch"] 446 | slurm_job_name = hyper_config["slurm"]["batch"]["J"] if name_condition else "echo_trial" 447 | n_workers = hyper_config["slurm"]["jobs"] 448 | for worker in range(n_workers): 449 | w = subprocess.Popen( 450 | f"sbatch -J {slurm_job_name}_{worker} {script_location}", 451 | shell=True, 452 | stdout = subprocess.PIPE, 453 | stderr = subprocess.PIPE 454 | ).communicate() 455 | job_ids.append( 456 | w[0].decode("utf-8").strip("\n").split(" ")[-1] 457 | ) 458 | logging.info( 459 | f"Submitted slurm batch job {worker + 1}/{n_workers} with id {job_ids[-1]}" 460 | ) 461 | 462 | # Write the job ids to file for reference 463 | with open(os.path.join(script_path, "slurm_job_ids.txt"), "w") as fid: 464 | for line in job_ids: 465 | fid.write(f"{line}\n") 466 | 467 | ############### 468 | # 469 | # PBS SUPPORT 470 | # 471 | ############### 472 | 473 | if "pbs" in hyper_config: 474 | launch_script = prepare_pbs_launch_script(hyper_config, model_config) 475 | 476 | # Save the configured script 477 | script_path = hyper_config["optuna"]["save_path"] 478 | script_location = os.path.join(script_path, "launch_pbs.sh") 479 | with open(script_location, "w") as fid: 480 | for line in launch_script: 481 | fid.write(f"{line}\n") 482 | 483 | # Launch the slurm jobs 484 | job_ids = [] 485 | name_condition = "J" in hyper_config["pbs"]["batch"] 486 | slurm_job_name = hyper_config["pbs"]["batch"]["N"] if name_condition else "echo_trial" 487 | n_workers = hyper_config["pbs"]["jobs"] 488 | for worker in range(n_workers): 489 | w = subprocess.Popen( 490 | f"qsub -N {slurm_job_name}_{worker} {script_location}", 491 | shell=True, 492 | stdout = subprocess.PIPE, 493 | stderr = subprocess.PIPE 494 | ).communicate() 495 | job_ids.append( 496 | w[0].decode("utf-8").strip("\n") 497 | ) 498 | logging.info( 499 | f"Submitted pbs batch job {worker + 1}/{n_workers} with id {job_ids[-1]}" 500 | ) 501 | 502 | # Write the job ids to file for reference 503 | with open(os.path.join(script_path, "pbs_job_ids.txt"), "w") as fid: 504 | for line in job_ids: 505 | fid.write(f"{line}\n") 506 | -------------------------------------------------------------------------------- /blog/site/generators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (10/27/20) Methods, iterables, and generators for reading files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We all need to open, close, and save data to files using Python. Loading a file always involves reading the lines in the file, and possibly doing some processing on each line. An example file $\\textbf{test_data.txt}$ contains 1,000,000 rows (lines) and 3 columns separated by an empty space." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import time" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "fn = \"generators/test_data.txt\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "##### The Python Method" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def method_loader(fn):\n", 49 | " with open(fn) as fid:\n", 50 | " lines = fid.readlines() # Read in lines until reaching EOF\n", 51 | " lines = [line.strip(\"\\n\").split(\" \")[-1] for line in lines] # Process each line\n", 52 | " return lines" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "for processed_line in method_loader(fn):\n", 62 | " break" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "The entire file was read into memory with the .readlines() call, before any line processing was performed." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Reading files like this shouldn't be a problem for file sizes up to ~1G. But sometimes we have no choice and have to work with large files (sometimes hundreds of gigs). As a result the readlines operation can take a very long time. Furthermore, if the file is too large to load into memory, python will throw the error __MemoryError__ and your program will terminate *with error*." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "We frequently encounter large datafiles at NCAR. What can we do about it? \n", 84 | "\n", 85 | "Reading and processing one line at a time would solve this problem. We could even process an \"infinitely\" large file, which means any file that's too large to load fully into memory.\n", 86 | "\n", 87 | "This kind of file reading is called __lazy__ reading." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "##### The Python Generator" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "There is a special tool in the python toolbox that easily enables lazy reading called a generator. The generator object is built on top of python's Iterator object class, but I will cover them in reverse below." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 14, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def generator_loader(fn):\n", 111 | " with open(fn, \"r\") as fid:\n", 112 | " for line in fid:\n", 113 | " yield line.strip(\"\\n\").split(\" \")[-1]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "We've replaced the return with something new named $\\color{green}{\\textbf{yield}}$. This chunk of code looks similar to the method_loader!\n", 121 | "\n", 122 | "Test it." 
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 53,
128 |    "metadata": {},
129 |    "outputs": [
130 |     {
131 |      "name": "stdout",
132 |      "output_type": "stream",
133 |      "text": [
134 |       "0\n"
135 |      ]
136 |     }
137 |    ],
138 |    "source": [
139 |     "for processed_line in generator_loader(fn):\n",
140 |     "    print(processed_line)\n",
141 |     "    break # Stop early, I don't need to print 1,000,000 lines!"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "It behaves similarly in use to the method variant presented above. The big difference is that the generator version is more memory efficient because, through $\\color{green}{\\textbf{yield}}$, lines are read into memory one at a time, returned, released, ..., until reaching the end of the file (EOF). \n",
149 |     "\n",
150 |     "That is to say, $\\color{green}{\\textbf{yield}}$ returns more than once, whereas $\\color{green}{\\textbf{return}}$ in a method signals the end (in terms of memory usage, as it is freed and the method is exited). "
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Generators work nicely with serialized data ... you may have __*dumped*__ data using the pickle library before. The pickle library allows you to do that for the entire file, all in one go, or line-by-line as the example below illustrates:"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 16,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "import pickle"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 18,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "fn_pkl = \"generators/test_data.pkl\""
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 19,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "def write_to_pickle(data, fn):\n",
185 |     "    with open(fn, \"wb\") as fid:\n",
186 |     "        for line in data:\n",
187 |     "            pickle.dump(line, fid) # Iteration over .dump"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 21,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "write_to_pickle(\n",
197 |     "    method_loader(fn),\n",
198 |     "    fn_pkl\n",
199 |     ")"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "From here on, we will assume that we do not know how many lines are in our serialized data dump:"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 22,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "def load_from_pickle(fn):\n",
216 |     "    with open(fn, \"rb\") as fid:\n",
217 |     "        while True: # Keep looping with while.\n",
218 |     "            yield pickle.load(fid) # Iteration over .load"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "where the $\\color{green}{\\textbf{while True}}$ clause will keep looping over the call to load pickled data until we reach the end of the file. \n",
226 |     "\n",
227 |     "Test it!"
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 23, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "ename": "EOFError", 237 | "evalue": "Ran out of input", 238 | "output_type": "error", 239 | "traceback": [ 240 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 241 | "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", 242 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mload_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn_pkl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 243 | "\u001b[0;32m\u001b[0m in \u001b[0;36mload_from_pickle\u001b[0;34m(fn)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfid\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Keep looping with while.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Iteration over .load\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 244 | "\u001b[0;31mEOFError\u001b[0m: Ran out of input" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "for row in load_from_pickle(fn_pkl):\n", 250 | " continue" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "It failed! \n", 258 | "\n", 259 | "We still need a signal that can be used to stop the generator from yielding the next line when it does not exist. \n", 260 | "\n", 261 | "Note that python threw an end-of-file error, $\color{red}{\textbf{EOFError}}$. We can catch that and use it to exit the generator:" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 24, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "def load_from_pickle(fn):\n", 271 | " with open(fn, \"rb\") as fid:\n", 272 | " try:\n", 273 | " while True: # We do not necessarily know how many lines are in fn\n", 274 | " yield pickle.load(fid) \n", 275 | " except EOFError:\n", 276 | " pass # Do nothing and leave load_from_pickle without error " 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Now it will run without error:" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 25, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "for row in load_from_pickle(fn_pkl):\n", 293 | " continue\n", 294 | " \n", 295 | "# Finishes without error" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "This is rather clunky! Now we have to do *exception handling* (gasp). You might be wondering what good python generators are at simplifying memory usage when this style of coding makes the workflow more complex. We could have relied on python's Iterator objects (covered next) to do the lazy reading." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Fortunately, there is a generalized version of yield, $\color{green}{\textbf{yield from}}$, which hands iteration off to another iterable. One caveat about the version below: pickle.load returns a single object per call, so this generator unpickles only the *first* record and then yields its contents. It avoids the $\color{red}{\textbf{EOFError}}$ simply because it never reads past that first object; for a multi-record file, the try/except version above remains the robust pattern:" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 26, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "def load_from_pickle(fn):\n", 319 | " with open(fn, \"rb\") as fid:\n", 320 | " yield from pickle.load(fid)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 27, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "for row in load_from_pickle(fn_pkl):\n", 330 | " continue" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 28, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# Finishes without error. " 340 | ] 341 | },
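{ "cell_type": "markdown", "metadata": {}, "source": [ "What $\color{green}{\textbf{yield from}}$ is really for is delegating to another iterable or generator; it is (roughly) shorthand for a for loop that yields every item. A minimal sketch (the helper name load_many is made up for illustration; generator_loader is the lazy line reader from above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_many(*filenames):\n", "    # \"yield from iterable\" is roughly: for item in iterable: yield item\n", "    for name in filenames:\n", "        yield from generator_loader(name)  # Hand iteration off to the inner generator" ] },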
342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "In short, generators let us write simple code that keeps memory usage manageable when working with large data files." 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "##### The Python Iterator" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "Before generators were introduced, one relied on a python __Iterator__ object to produce lazy readers. Iterator classes are not too difficult to write, but they come with requirements: in particular, they must implement the $\color{blue}{\textbf{__iter__}}$ and $\color{blue}{\textbf{__next__}}$ \"dunder\" (double-underscore) methods. \n", 361 | "\n", 362 | "A simple example with our serialized (pickled) data from above:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 44, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "class read_from_pickle_iterable:\n", 372 | " \n", 373 | " def __init__(self, fn):\n", 374 | " self.fn = fn\n", 375 | " self.fid = open(self.fn, \"rb\")\n", 376 | " \n", 377 | " def __iter__(self):\n", 378 | " return self\n", 379 | " \n", 380 | " def __next__(self):\n", 381 | " try:\n", 382 | " return pickle.load(self.fid)\n", 383 | " except EOFError:\n", 384 | " raise StopIteration" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "The dunder method $\color{blue}{\textbf{__iter__}}$ returns the object itself (through self!), while $\color{blue}{\textbf{__next__}}$ returns the result of the .load call on the opened file." 392 | ] 393 | },
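{ "cell_type": "markdown", "metadata": {}, "source": [ "As a mental model, this is roughly what a for loop does with any iterator under the hood (a comment-only sketch, not the full CPython machinery):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What \"for line in obj:\" expands to, approximately:\n", "#\n", "#     iterator = iter(obj)           # calls obj.__iter__()\n", "#     while True:\n", "#         try:\n", "#             line = next(iterator)  # calls iterator.__next__()\n", "#         except StopIteration:\n", "#             break                  # the loop ends quietly\n", "#         ...                        # loop body runs with line" ] },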
394 | { 395 | "cell_type": "code", 396 | "execution_count": 49, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "rfpi = read_from_pickle_iterable(fn_pkl)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "Using the Iterator's $\color{blue}{\textbf{__next__}}$ functionality, we can then grab the lines from the file one-by-one without reading the entire file into memory:" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 51, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "ename": "StopIteration", 417 | "evalue": "", 418 | "output_type": "error", 419 | "traceback": [ 420 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 421 | "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", 422 | "\u001b[0;32m\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 423 | "\u001b[0;31mEOFError\u001b[0m: Ran out of input", 424 | "\nDuring handling of the above exception, another exception occurred:\n", 425 | "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", 426 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrfpi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Use next like this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 427 | "\u001b[0;32m\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 428 | "\u001b[0;31mStopIteration\u001b[0m: " 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "while True:\n", 434 | " next(rfpi) # Use next like this" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "which dies as intended: the $\color{red}{\textbf{EOFError}}$ is caught and a $\color{red}{\textbf{StopIteration}}$ is thrown using the $\color{green}{\textbf{raise}}$ clause. When the iterator is consumed the usual way, for example by a for loop (or by list(), enumerate(), etc.), the StopIteration is handled for us and the loop exits without error:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 52, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "for line in read_from_pickle_iterable(fn_pkl):\n", 451 | " continue" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Why even have generators when there are already iterators?" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "The answer is that generators are more compact and easier to write: you do not have to implement the $\color{blue}{\textbf{__iter__}}$ and $\color{blue}{\textbf{__next__}}$ methods yourself, as that is taken care of under the hood by the generator machinery. The converse is not true: plain iterator classes do not get the yield capability for free." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "### When should I use a generator rather than a method?" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "There are lots of scenarios, in addition to data loading! Note that the first method example above returned a list. Do you need all elements of that list at the same time? If the answer is no, and the file is large, then you want to try a generator!\n", 480 | "\n", 481 | "With generators, one trades the time spent performing operations on the data against memory utilization. Whether to use them is often determined by how your program is designed to run and utilize resources. If you face a significant memory bottleneck, generators are very often the way to go. However, if your program has no such memory issue, using a generator might make it run significantly slower. \n", 482 | "\n", 483 | "As you use generators more and more in your workflow, you will learn to apply them where they are needed and to avoid them where they offer no benefit over ordinary methods." 484 | ] 485 | },
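{ "cell_type": "markdown", "metadata": {}, "source": [ "To make the memory trade-off concrete, here is a small self-contained comparison using a *generator expression*, the inline cousin of a generator function. Exact byte counts will vary with your python version and platform:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "squares_list = [n * n for n in range(1_000_000)]  # Every element is materialized right now\n", "squares_gen = (n * n for n in range(1_000_000))  # Nothing has been computed yet\n", "\n", "print(sys.getsizeof(squares_list))  # Megabytes: the list holds a million references\n", "print(sys.getsizeof(squares_gen))  # ~100 bytes: just the generator's running state" ] },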
486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Feel free to email me (John Schreck, schreck@ucar.edu) with any questions, corrections, or suggestions!" 491 | ] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.8.6" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 4 515 | } 516 | -------------------------------------------------------------------------------- /blog/site/slurm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (12/22/20) Helpful SLURM Commands\n", 8 | "David John Gagne\n", 9 | "\n", 10 | "SLURM is currently the scheduler on the Casper cluster, which means it manages how jobs are queued and dispatched. You are likely very familiar with sbatch and squeue at this point. 
SLURM also has a whole suite of other commands that give you an incredibly detailed view into the usage of the cluster by yourself and everyone else. This blog will provide some insights to help you better manage your own jobs and keep track of how busy Casper is." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Track your job memory and CPU usage with sacct\n", 18 | "`sacct` queries the SLURM scheduler database to find out how well you or any other user has utilized their requested resources on a job by job basis. The default output of sacct is not very useful, but with a few alterations to the command, you can get a wealth of information.\n", 19 | "\n", 20 | "I recommend running sacct in the following format (Note that ! allows you to run a command line program within a notebook. Do not copy the ! if you want to use the command in the terminal window):" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "execution": { 28 | "iopub.execute_input": "2020-12-22T19:47:31.923723Z", 29 | "iopub.status.busy": "2020-12-22T19:47:31.923313Z", 30 | "iopub.status.idle": "2020-12-22T19:47:32.129176Z", 31 | "shell.execute_reply": "2020-12-22T19:47:32.128743Z" 32 | } 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | " User JobName JobID AllocNodes ReqCPUS Elapsed CPUTime TotalCPU ReqMem MaxRSS ExitCode State \r\n", 40 | "--------- ---------- ------------ ---------- -------- ---------- ---------- ---------- ---------- ---------- -------- ---------- \r\n", 41 | " dgagne sfc 6249373 1 16 00:00:06 00:01:36 00:00.683 128Gn 1:0 FAILED \r\n", 42 | " batch 6249373.bat+ 1 16 00:00:06 00:01:36 00:00.682 128Gn 0 1:0 FAILED \r\n", 43 | " extern 6249373.ext+ 1 16 00:00:06 00:01:36 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 44 | " dgagne sfc 6249377 1 16 00:00:39 00:10:24 00:12.017 128Gn 1:0 FAILED \r\n", 45 | " batch 6249377.bat+ 1 16 00:00:39 00:10:24 00:12.016 128Gn 0.35G 1:0 FAILED \r\n", 46 | " extern 6249377.ext+ 1 16 00:00:39 00:10:24 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 47 | " dgagne sfc 6249380 1 16 00:00:23 00:06:08 00:11.773 128Gn 1:0 FAILED \r\n", 48 | " batch 6249380.bat+ 1 16 00:00:23 00:06:08 00:11.772 128Gn 0 1:0 FAILED \r\n", 49 | " extern 6249380.ext+ 1 16 00:00:23 00:06:08 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 50 | " dgagne sfc 6249390 1 16 00:08:07 02:09:52 21:33.130 128Gn 0:0 COMPLETED \r\n", 51 | " batch 6249390.bat+ 1 16 00:08:07 02:09:52 21:33.129 128Gn 1.40G 0:0 COMPLETED \r\n", 52 | " extern 6249390.ext+ 1 16 00:08:08 02:10:08 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 53 | " dgagne sfc 6250928 1 16 00:01:47 00:28:32 01:21.477 128Gn 1:0 FAILED \r\n", 54 | " batch 6250928.bat+ 1 16 00:01:47 00:28:32 01:21.476 128Gn 0.90G 1:0 FAILED \r\n", 55 | " extern 6250928.ext+ 1 16 00:01:47 00:28:32 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 56 | " dgagne sfc 6250961 1 16 00:01:21 00:21:36 01:19.700 128Gn 1:0 FAILED \r\n", 57 | " batch 6250961.bat+ 1 16 00:01:21 00:21:36 01:19.699 128Gn 0.90G 1:0 FAILED \r\n", 58 | " extern 6250961.ext+ 1 16 00:01:21 00:21:36 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 59 | " dgagne sfc 6251023 1 16 00:01:58 00:31:28 01:23.110 128Gn 1:0 FAILED \r\n", 60 | " batch 6251023.bat+ 1 16 00:01:58 00:31:28 01:23.109 128Gn 0.90G 1:0 FAILED \r\n", 61 | " extern 6251023.ext+ 1 16 00:01:58 00:31:28 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 62 | " dgagne sfc 6251026 1 16 00:04:27 01:11:12 09:42.497 128Gn 1:0 FAILED \r\n", 63 | " batch 6251026.bat+ 1 
16 00:04:27 01:11:12 09:42.496 128Gn 1.16G 1:0 FAILED \r\n", 64 | " extern 6251026.ext+ 1 16 00:04:27 01:11:12 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 65 | " dgagne sfc 6257007 1 16 00:05:45 01:32:00 13:28.988 128Gn 1:0 FAILED \r\n", 66 | " batch 6257007.bat+ 1 16 00:05:45 01:32:00 13:28.987 128Gn 1.24G 1:0 FAILED \r\n", 67 | " extern 6257007.ext+ 1 16 00:05:45 01:32:00 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 68 | " dgagne sfc 6257047 1 16 00:00:03 00:00:48 00:03.086 128Gn 1:0 FAILED \r\n", 69 | " batch 6257047.bat+ 1 16 00:00:03 00:00:48 00:03.085 128Gn 0 1:0 FAILED \r\n", 70 | " extern 6257047.ext+ 1 16 00:00:03 00:00:48 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 71 | " dgagne casp_nb 6266629 1 12 06:00:00 3-00:00:00 00:22.151 256Gn 0:0 TIMEOUT \r\n", 72 | " batch 6266629.bat+ 1 12 06:00:01 3-00:00:12 00:22.150 256Gn 0.30G 0:15 CANCELLED \r\n", 73 | " extern 6266629.ext+ 1 12 06:00:00 3-00:00:00 00:00.001 256Gn 0 0:0 COMPLETED \r\n", 74 | " dgagne sfc 6295916 1 16 00:03:06 00:49:36 05:27.670 128Gn 1:0 FAILED \r\n", 75 | " batch 6295916.bat+ 1 16 00:03:06 00:49:36 05:27.669 128Gn 1.06G 1:0 FAILED \r\n", 76 | " extern 6295916.ext+ 1 16 00:03:06 00:49:36 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 77 | " dgagne sfc 6295929 1 16 00:11:13 02:59:28 28:03.125 128Gn 0:0 COMPLETED \r\n", 78 | " batch 6295929.bat+ 1 16 00:11:13 02:59:28 28:03.124 128Gn 1.38G 0:0 COMPLETED \r\n", 79 | " extern 6295929.ext+ 1 16 00:11:13 02:59:28 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 80 | " dgagne htrainrt 6316207 1 30 00:10:11 05:05:30 48:06.423 200Gn 0:0 COMPLETED \r\n", 81 | " batch 6316207.bat+ 1 30 00:10:11 05:05:30 48:06.422 200Gn 56.67G 0:0 COMPLETED \r\n", 82 | " extern 6316207.ext+ 1 30 00:10:11 05:05:30 00:00.001 200Gn 0 0:0 COMPLETED \r\n", 83 | " dgagne htrainrt 6316247 1 30 00:44:20 22:10:00 02:20:53 200Gn 0:0 COMPLETED \r\n", 84 | " batch 6316247.bat+ 1 30 00:44:20 22:10:00 02:20:52 200Gn 104.03G 0:0 COMPLETED \r\n", 85 | " extern 6316247.ext+ 1 30 00:44:21 22:10:30 00:01.001 200Gn 0.00G 0:0 COMPLETED \r\n", 86 | " dgagne casp_nb 6319681 1 4 00:00:33 00:02:12 00:00.286 64Gn 0:0 COMPLETED \r\n", 87 | " batch 6319681.bat+ 1 4 00:00:33 00:02:12 00:00.285 64Gn 0.05G 0:0 COMPLETED \r\n", 88 | " extern 6319681.ext+ 1 4 00:00:33 00:02:12 00:00.001 64Gn 0 0:0 COMPLETED \r\n", 89 | " dgagne casp_nb 6319684 1 8 00:00:32 00:04:16 00:00.280 64Gn 0:0 COMPLETED \r\n", 90 | " batch 6319684.bat+ 1 8 00:00:32 00:04:16 00:00.280 64Gn 0.05G 0:0 COMPLETED \r\n", 91 | " extern 6319684.ext+ 1 8 00:00:32 00:04:16 00:00:00 64Gn 0 0:0 COMPLETED \r\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "! sacct --units=G --format=\"User,JobName,JobID,AllocNodes,ReqCPUs,Elapsed,CPUTime,TotalCPU,ReqMem,MaxRSS,ExitCode,State\" -S 2020-12-01 -E 2020-12-31 -u dgagne" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "The command breaks down into these parts:\n", 104 | "- `--units=G`: Print all memory-related outputs in Gigabytes. You can also use M or K for Megabytes and Kilobytes\n", 105 | "- `--format=\"...\"`: The list of columns to output. The full list can be found [here](https://slurm.schedmd.com/sacct.html). \n", 106 | "- `-S 2020-12-01`: The start date for the query. Can be adjusted so only recent jobs are visible.\n", 107 | "- `-E 2020-12-31`: The end date for the query. \n", 108 | "- `-u dgagne`: The username. Can be a comma separated list of users, like `-u dgagne,cbecker,schreck,ggantos`\n", 109 | "\n", 110 | "What does the output mean? 
The most relevant comparisons relate to CPU and memory usage. \n", 111 | "- Elapsed: total time the job runs in Day-Hour:Minute:Second format.\n", 112 | "- CPUTime: total time the CPUs are allocated, which should be close to Elapsed * ReqCPUs. \n", 113 | "- TotalCPU: The total amount of time the CPUs are in use by the user or the system. If this is far less than CPUTime, then you are requesting too many CPUs for your job. Note that TotalCPU does not account for child processes, so if you are running multiprocessing or dask, this number may be deceptively low. \n", 114 | "\n", 115 | "For memory usage\n", 116 | "- ReqMem: The total amount of memory the job requested.\n", 117 | "- MaxRSS: The maximum amount of memory the job used. If MaxRSS is far less than ReqMem, then decrease future memory requests. If it is the same or close to the same as ReqMem and your job is taking a longer than expected time to run, the program may be swapping memory to disk. You should ask for more memory in that case. " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## Track current cluster usage with sinfo\n", 125 | "`sinfo` prints out information about the current usage of every node in the cluster. It is helpful to see which nodes have what resources, and you can see how busy each node is. This may be especially useful when you are about to launch a multi-GPU or large memory job and want to make sure the memory and GPUs are available. The default `sinfo` call provides a very high level summary. Just like `sacct`, I recommend running the following command:\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 2, 131 | "metadata": { 132 | "execution": { 133 | "iopub.execute_input": "2020-12-22T19:47:32.132826Z", 134 | "iopub.status.busy": "2020-12-22T19:47:32.132459Z", 135 | "iopub.status.idle": "2020-12-22T19:47:32.281862Z", 136 | "shell.execute_reply": "2020-12-22T19:47:32.281505Z" 137 | } 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "HOSTNAMES AVAIL_FEATURES CPUS CPU_LOAD GRES GRES_USED ALLOCMEM FREE_MEM STATE AVAIL \r\n", 145 | "casper23 casper,skylake,mlx5_0,gp100,gpu,x11 72 0.16 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 0 342060 drained up \r\n", 146 | "casper20 casper,skylake,mlx5_0 72 0.56 (null) gpu:0,mps:0 0 295669 reserved up \r\n", 147 | "casper25 casper,skylake,mlx5_0,4xv100,v100,gpu 72 0.04 gpu:v100:4,mps:v100:400 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 734630 reserved up \r\n", 148 | "casper28 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.01 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 1123240 reserved up \r\n", 149 | "casper01 casper,skylake,mlx5_0 72 1.39 (null) gpu:0,mps:0 247808 217182 mixed up \r\n", 150 | "casper02 casper,skylake,mlx5_0 72 0.62 (null) gpu:0,mps:0 380044 335151 mixed up \r\n", 151 | "casper03 casper,skylake,mlx5_0 72 17.92 (null) gpu:0,mps:0 382534 325258 mixed up \r\n", 152 | "casper04 casper,skylake,mlx5_0 72 2.29 (null) gpu:0,mps:0 379904 305334 mixed up \r\n", 153 | "casper05 casper,skylake,mlx5_0 72 18.23 (null) gpu:0,mps:0 374784 314173 mixed up \r\n", 154 | "casper06 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.21 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 355328 337188 mixed up \r\n", 155 | "casper07 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.14 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 310854 341183 mixed up \r\n", 156 | "casper09 casper,skylake,mlx5_0,4xv100,v100,gpu 72 3.14 gpu:v100:4,mps:v100:400 
gpu:v100:4(IDX:0-3),mps:v100:0(IDX:N/A) 349452 649024 mixed up \r\n", 157 | "casper10 casper,skylake,mlx5_0 72 5.44 (null) gpu:0,mps:0 370968 332526 mixed up \r\n", 158 | "casper11 casper,skylake,mlx5_0 72 3.79 (null) gpu:0,mps:0 384140 336050 mixed up \r\n", 159 | "casper12 casper,skylake,mlx5_0 72 3.43 (null) gpu:0,mps:0 381952 325012 mixed up \r\n", 160 | "casper13 casper,skylake,mlx5_0 72 3.70 (null) gpu:0,mps:0 384582 333591 mixed up \r\n", 161 | "casper14 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.27 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 358982 342191 mixed up \r\n", 162 | "casper15 casper,skylake,mlx5_0,gp100,gpu,x11 72 2.29 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 307200 341551 mixed up \r\n", 163 | "casper16 casper,skylake,mlx5_0,gp100,gpu,x11 72 1.20 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 204800 344260 mixed up \r\n", 164 | "casper17 casper,skylake,mlx5_0,gp100,gpu,x11 72 1.28 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 364544 343480 mixed up \r\n", 165 | "casper18 casper,skylake,mlx5_0 72 17.04 (null) gpu:0,mps:0 382608 248991 mixed up \r\n", 166 | "casper19 casper,skylake,mlx5_0 72 31.76 (null) gpu:0,mps:0 352150 259655 mixed up \r\n", 167 | "casper22 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.19 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 310854 336178 mixed up \r\n", 168 | "casper24 casper,skylake,mlx5_0,8xv100,v100,gpu 72 10.09 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 256000 1053921 mixed up \r\n", 169 | "casper26 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.68 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 323142 296209 mixed up \r\n", 170 | "casper27 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.04 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 307200 1115535 mixed up \r\n", 171 | "casper29 casper,cascadelake,mlx5_0,4xv100,v100,gpu 72 0.84 gpu:v100:4,mps:v100:400 gpu:v100:3(IDX:0-2),mps:v100:0(IDX:N/A) 277062 728882 mixed up \r\n", 172 | "casper30 casper,cascadelake,mlx5_0,8xv100,v100,gpu 72 8.40 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 51200 1060850 mixed up \r\n", 173 | "casper31 casper,cascadelake,mlx5_0,8xv100,v100,gpu 72 8.32 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 51200 1059902 mixed up \r\n", 174 | "casper36 casper,cascadelake,mlx5_0,4xv100,v100,gpu 72 52.14 gpu:v100:4,mps:v100:400 gpu:v100:2(IDX:0,2),mps:v100:0(IDX:N/A) 671744 641858 mixed up \r\n", 175 | "casper21 casper,skylake,mlx5_0 72 20.37 (null) gpu:0,mps:0 373532 161434 allocated up \r\n", 176 | "casper08 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.01 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 1114594 idle up \r\n", 177 | "gladeslurm1 hsi 16 11.48 (null) gpu:0,mps:0 0 16338 idle up \r\n", 178 | "gladeslurm2 hsi 16 14.10 (null) gpu:0,mps:0 0 15644 idle up \r\n", 179 | "gladeslurm3 hsi 16 5.25 (null) gpu:0,mps:0 0 12911 idle up \r\n", 180 | "gladeslurm4 hsi 16 4.20 (null) gpu:0,mps:0 0 14423 idle up \r\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "! 
sinfo --Format=\"NodeHost:15,Features:50,CPUs:5,CPUsLoad:10,Gres:30,GresUsed:50,AllocMem:.15,FreeMem:.15 ,StateLong:15,Available:6\"" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "The columns provide the following information:\n", 193 | "- NodeHost: Prints the name of each node.\n", 194 | "- Features: Lists the CPU type (skylake or cascadelake), and the number and type of GPUs, if any\n", 195 | "- CPUs: Number of CPUs available, which is number of sockets * number of cores * threads per core (threads per core > 1 only with multithreading)\n", 196 | "- CPUsLoad: The node's recent load average, roughly how many CPU cores are busy\n", 197 | "- Gres: Number and type of GPUs\n", 198 | "- GresUsed: How many GPUs are currently allocated on the node\n", 199 | "- AllocMem: How much memory is allocated, in MB\n", 200 | "- FreeMem: How much memory is free, in MB\n", 201 | "- StateLong: Node usage, which can be idle, mixed, allocated, reserved, or drained\n", 202 | "- Available: up or down" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.8.6" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 4 234 | } 235 | --------------------------------------------------------------------------------