├── aimlutils ├── __init__.py ├── data │ ├── __init__.py │ └── splitting.py ├── echo │ ├── __init__.py │ ├── src │ │ ├── __init__.py │ │ ├── pruners.py │ │ ├── trial_suggest.py │ │ ├── samplers.py │ │ └── base_objective.py │ ├── examples │ │ ├── keras │ │ │ ├── launch.sh │ │ │ ├── model_config.yml │ │ │ ├── hyperparameter.yml │ │ │ ├── objective.py │ │ │ ├── data_generator.py │ │ │ └── model.py │ │ └── torch │ │ │ ├── model.yml │ │ │ ├── hyperparameter.yml │ │ │ └── objective.py │ ├── report.py │ ├── run.py │ ├── README.ipynb │ ├── README.md │ └── optimize.py ├── torch │ ├── __init__.py │ ├── losses │ │ ├── __init__.py │ │ └── losses.py │ ├── models │ │ └── __init__.py │ ├── trainers │ │ ├── __init__.py │ │ └── trainers.py │ ├── checkpoint │ │ ├── __init__.py │ │ └── checkpointer.py │ └── optimizers │ │ ├── __init__.py │ │ └── optimizers.py └── utils │ ├── __init__.py │ ├── tqdm.py │ └── gpu.py ├── __version__.py ├── blog ├── site │ ├── generators │ │ └── test_data.pkl │ ├── memory_images │ │ ├── complex.png │ │ ├── test.py.png │ │ ├── complex_plot.png │ │ ├── example.py.png │ │ ├── example.py_m.png │ │ ├── mprofile.dat.png │ │ ├── mprof_run_plot.png │ │ ├── test.py_output.png │ │ ├── example.py_output.png │ │ └── example.py_m_output.png │ ├── _toc.yml │ ├── home.md │ ├── howto.md │ ├── _config.yml │ ├── memory.ipynb │ ├── optuna_mariadb.ipynb │ ├── generators.ipynb │ └── slurm.ipynb ├── build.sh ├── publish.sh └── NCAR_UCAR_LIVERY │ └── Logos │ └── Contemporary Logos │ └── NCAR │ └── Dark Logo │ └── NCAR-contemp-logo-blue.png ├── .gitignore ├── requirements.txt ├── README.md └── setup.py /aimlutils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/echo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/echo/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' -------------------------------------------------------------------------------- /aimlutils/torch/losses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aimlutils/torch/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /aimlutils/torch/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from aimlutils.torch.checkpoint.checkpointer import * 2 | -------------------------------------------------------------------------------- /aimlutils/torch/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from aimlutils.torch.optimizers.optimizers import * 2 | -------------------------------------------------------------------------------- /blog/site/generators/test_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/generators/test_data.pkl -------------------------------------------------------------------------------- /blog/site/memory_images/complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/complex.png -------------------------------------------------------------------------------- /blog/site/memory_images/test.py.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/test.py.png -------------------------------------------------------------------------------- /blog/site/memory_images/complex_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/complex_plot.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py_m.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_m.png -------------------------------------------------------------------------------- /blog/site/memory_images/mprofile.dat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/mprofile.dat.png -------------------------------------------------------------------------------- /blog/site/memory_images/mprof_run_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/mprof_run_plot.png -------------------------------------------------------------------------------- /blog/site/memory_images/test.py_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/test.py_output.png -------------------------------------------------------------------------------- /blog/site/memory_images/example.py_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_output.png 
--------------------------------------------------------------------------------
/blog/site/memory_images/example.py_m_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/site/memory_images/example.py_m_output.png
--------------------------------------------------------------------------------
/blog/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Clean out the current build
4 | jupyter-book clean site/_build
5 | # Build the site with jupyter-book
6 | jupyter-book build site
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | blog/site/_build/*
2 | .ipynb_checkpoints/*
3 | */.ipynb_checkpoints/*
4 | */*/.ipynb_checkpoints/*
5 | */*/*/.ipynb_checkpoints/*
6 | blog/NCAR_UCAR_LIVERY/*
7 | blog/.DS_Store
--------------------------------------------------------------------------------
/blog/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Publish the site on GitHub
4 | ghp-import -n -p -f site/_build/html
5 | 
6 | # Print the site domain address for convenience
7 | echo "https://ncar.github.io/aiml-utils/home.html"
8 | 
--------------------------------------------------------------------------------
/blog/site/_toc.yml:
--------------------------------------------------------------------------------
1 | - file: home
2 | - file: howto
3 | - file: optuna_mariadb.ipynb
4 | - file: slurm.ipynb
5 | - file: memory.ipynb
6 | - file: callbacks.ipynb
7 | - file: data_loaders.ipynb
8 | - file: generators.ipynb
--------------------------------------------------------------------------------
/blog/NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NCAR/aiml-utils/master/blog/NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | optuna
4 | matplotlib
5 | tensorflow
6 | torch
7 | torchvision
8 | pyyaml
9 | scipy
10 | xarray
11 | netcdf4
12 | jupyter
13 | jupyter-book
14 | ghp-import
15 | sphinxcontrib-bibtex<2.0.0
--------------------------------------------------------------------------------
/aimlutils/utils/tqdm.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm as tqdm_base
2 | 
3 | def tqdm(*args, **kwargs):
4 |     if hasattr(tqdm_base, '_instances'):
5 |         for instance in list(tqdm_base._instances):
6 |             tqdm_base._decr_instances(instance)
7 |     return tqdm_base(*args, **kwargs)
8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aiml-utils
2 | This repository contains:
3 | 
4 | - The AIML group's jupyter-book blog website (blog/). [See also here](https://ncar.github.io/aiml-utils/home.html).
5 | 
6 | - **E**arth **C**omputing **H**yperparameter **O**ptimization, ECHO (aimlutils/echo)
7 | 
8 | - Utilities that are shared across different projects (aimlutils/[utils,data,torch])
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | #SBATCH --account=NAML0001
3 | #SBATCH --gres=gpu:v100:1
4 | #SBATCH --mem=128G
5 | #SBATCH -n 8
6 | #SBATCH -t 12:00:00
7 | #SBATCH -J hyper_opt
8 | #SBATCH -o hyper_opt.out
9 | #SBATCH -e hyper_opt.err
10 | module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1
11 | ncar_pylib
12 | python run.py examples/keras/hyperparameter.yml examples/keras/model_config.yml
--------------------------------------------------------------------------------
/aimlutils/utils/gpu.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | def gpu_report():
4 |     """Get the current free GPU memory per device.
5 | 
6 |     Returns
7 |     -------
8 |     usage: dict
9 |         Keys are device ids as integers.
10 |         Values are free memory as integers in MB (via nvidia-smi --query-gpu=memory.free).
11 |     """
12 |     cmd = ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader']
13 |     result = subprocess.check_output(cmd)
14 |     result = result.decode('utf-8')
15 |     # Convert lines into a dictionary
16 |     gpu_memory = [int(x) for x in result.strip().split('\n')]
17 |     gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
18 |     return gpu_memory_map
19 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/pruners.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import sys
5 | import optuna
6 | import logging
7 | from tensorflow.keras.callbacks import Callback
8 | 
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class KerasPruningCallback(Callback):
14 | 
15 |     def __init__(self, trial, monitor, interval = 1):
16 |         # type: (optuna.trial.Trial, str, int) -> None
17 | 
18 |         super(KerasPruningCallback, self).__init__()
19 | 
20 |         self.trial = trial
21 |         self.monitor = monitor
22 |         self.interval = interval
23 | 
24 |     def on_epoch_end(self, epoch, logs=None):
25 |         # type: (int, Dict[str, float]) -> None
26 | 
27 |         # Only report to optuna (and consider pruning) every `interval` epochs
28 |         if (epoch + 1) % self.interval != 0:
29 |             return
30 |         logs = logs or {}
31 |         current_score = logs.get(self.monitor)
32 |         if current_score is None:
33 |             return
34 |         self.trial.report(current_score, step=epoch)
35 |         if self.trial.should_prune():
36 |             message = "Trial was pruned at epoch {}.".format(epoch)
37 |             raise optuna.TrialPruned(message)
38 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/model_config.yml:
--------------------------------------------------------------------------------
1 | path_data: "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/"
2 | path_save: "examples/keras/results"
3 | model_name: "cnn"
4 | num_particles: 3
5 | random_seed: 328942
6 | output_cols: ["x", "y", "z", "d"]
7 | scaler_out: "StandardScaler"
8 | num_z_bins: 100
9 | subset: False
10 | conv2d_network:
11 |   filters: [8, 12, 16]
12 |   kernel_sizes: [5, 5, 5]
13 |   conv2d_activation: "leakyrelu"
14 |   pool_sizes: [5, 5, 5]
15 |   pool_dropout: 0.5
16 |   dense_sizes: [100, 50]
17 |   dense_dropout: 0.5
18 |   dense_activation: "leakyrelu"
19 |   output_activation: "linear"
20 |   lr: 0.0001
21 |   optimizer: "adam"
22 |   loss: "mse"
23 |   metrics: ["mse", "mae"]
24
| batch_size: 128 25 | epochs: 100 26 | verbose: 1 27 | callbacks: 28 | ModelCheckpoint: 29 | monitor: "val_loss" 30 | filepath: "examples/keras/results/model.h5" 31 | save_best_only: True 32 | save_weights_only: True 33 | EarlyStopping: 34 | monitor: "val_loss" 35 | patience: 4 36 | ReduceLROnPlateau: 37 | monitor: "val_loss" 38 | factor: 0.2 39 | patience: 1 40 | min_lr: 0.0000001 41 | mode: "auto" 42 | CSVLogger: 43 | filename: "examples/keras/results/training.txt" 44 | separator: " " 45 | append: True 46 | -------------------------------------------------------------------------------- /aimlutils/echo/src/trial_suggest.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import sys 5 | import optuna 6 | import logging 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | supported_trials = [ 13 | "categorical", 14 | "discrete_uniform", 15 | "float", 16 | "int", 17 | "loguniform", 18 | "uniform" 19 | ] 20 | 21 | 22 | def trial_suggest_loader(trial, config): 23 | 24 | try: 25 | _type = config["type"] 26 | if _type == "categorical": 27 | return trial.suggest_categorical(**config["settings"]) 28 | elif _type == "discrete_uniform": 29 | return int(trial.suggest_discrete_uniform(**config["settings"])) 30 | elif _type == "float": 31 | return float(trial.suggest_float(**config["settings"])) 32 | elif _type == "int": 33 | return int(trial.suggest_int(**config["settings"])) 34 | elif _type == "loguniform": 35 | return float(trial.suggest_loguniform(**config["settings"])) 36 | elif _type == "uniform": 37 | return float(trial.suggest_uniform(**config["settings"])) 38 | else: #if _type not in supported_trials: 39 | message = f"Type {_type} is not valid. 
Select from {supported_trials}"
40 |             logger.warning(message)
41 |             raise OSError(message)
42 |     except Exception as E:
43 |         logger.warning(f"Trial suggestion failed with error '{E}' for config {config}")
44 |         raise OSError(f"Trial suggestion failed for config {config}") from E
--------------------------------------------------------------------------------
/aimlutils/echo/examples/torch/model.yml:
--------------------------------------------------------------------------------
1 | log: 'examples/torch/test'
2 | type: "encoder-vae"
3 | 
4 | data:
5 |   path_data: "/glade/scratch/schreck/holodec/"
6 |   num_particles: "50-100"
7 |   maxnum_particles: 100
8 |   output_cols: ["x", "y", "z", "d", "binary"]
9 |   subset: False
10 | 
11 | transforms:
12 |   #RandomVerticalFlip: False
13 |   #RandomHorizontalFlip: False
14 |   Rescale: 600
15 |   Normalize: 'norm'
16 |   ToTensor: True
17 | 
18 | iterator:
19 |   num_workers: 8
20 |   batch_size: 32
21 |   pin_memory: True
22 |   shuffle: True
23 | 
24 | model:
25 |   image_channels: 1
26 |   hidden_dims: [3, 94, 141, 471, 425, 1122]
27 |   z_dim: 1277
28 |   dense_hidden_dims: [1000]
29 |   dense_dropouts: [0.0]
30 |   tasks: ["x", "y", "z", "d", "binary"]
31 |   pretrained_model: "/glade/work/schreck/repos/holodec-ml/scripts/schreck/compressor/pretrained/pretrained.pt"
32 | 
33 | optimizer:
34 |   type: "lookahead-diffgrad"
35 |   lr: 0.000631
36 |   weight_decay: 0.0
37 | 
38 | callbacks:
39 |   MetricsLogger:
40 |     path_save: "test"
41 |     reload: False
42 |   EarlyStopping:
43 |     patience: 5
44 |     verbose: True
45 |     path_save: "examples/torch/test/checkpoint.pt"
46 |   ExponentialLR:
47 |     gamma: 0.95
48 | 
49 | #  ReduceLROnPlateau:
50 | #    mode: "min"
51 | #    factor: 0.2
52 | #    patience: 1
53 | #    min_lr: 0.0000000001
54 | #    verbose: True
55 | 
56 | trainer:
57 |   start_epoch: 0
58 |   epochs: 1
59 |   clip: 1.0
60 |   alpha: 1.0
61 |   beta: 0.1
62 |   path_save: "examples/torch/test"
63 |   test_image: "examples/torch/test/image_600.pkl"
--------------------------------------------------------------------------------
/aimlutils/data/splitting.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split as _train_test_split
2 | from typing import Tuple
3 | import pandas as pd
4 | import numpy as np
5 | 
6 | # To do: Add a logger and verbose options
7 | 
8 | def stratified_split(df: pd.DataFrame,
9 |                      frac: float,
10 |                      column: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
11 |     """Stratified split on `column`; rows whose label occurs only once are kept in train."""
12 |     label_count = df[column].value_counts().to_dict()
13 |     labels_we_can_use = df[column].apply(lambda x: label_count[x] > 1)
14 |     items_with_count_one = df[~labels_we_can_use].copy()
15 |     items_needing_split = df[labels_we_can_use].copy()
16 | 
17 |     train, test = _train_test_split(
18 |         items_needing_split,
19 |         test_size=frac,
20 |         stratify=items_needing_split[column]
21 |     )
22 |     train = pd.concat([train, items_with_count_one], axis = 0, sort = True)#.reset_index(drop = True)
23 |     return train, test
24 | 
25 | 
26 | def train_test_split(df: pd.DataFrame,
27 |                      fraction: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame]:
28 |     """Stratified train/test split on the "label" column."""
29 |     fraction = min(1.0, fraction)
30 |     train, test = stratified_split(df, fraction, "label")
31 |     return train, test
32 | 
33 | 
34 | def train_test_val_split(df: pd.DataFrame,
35 |                          fraction: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
36 |     """Stratified train/test/validation split on the "label" column."""
37 |     fraction = min(1.0, fraction)
38 |     train, _test = stratified_split(df, fraction, "label")
39 |     test, val = stratified_split(_test, 0.5, "label")
40 | 
41 |     return train, test, val
--------------------------------------------------------------------------------
/blog/site/home.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | Welcome to the AIML group's blog page!
4 | 
5 | The goal of this site is to help current and future members of the AIML group at NCAR share programming lessons and tips.
6 | 
7 | # About Us
8 | The NCAR Analytics and Integrative Machine Learning group develops machine learning systems to improve our understanding and prediction of the many facets of Earth's systems. Our group works in close collaboration with domain scientists across NCAR's labs, the university community, and the private sector to develop and integrate machine learning tools into modeling and observation pipelines. Our group members are:
9 | 
10 | * David John Gagne, Machine Learning Scientist II (CISL/RAL)
11 | * John Schreck, Machine Learning Scientist I (CISL)
12 | * Charlie Becker, Associate Scientist II (CISL)
13 | * Gabrielle Gantos, Associate Scientist II (CISL)
14 | * Maria Molina, Project Scientist I (CGD)
15 | * Zhonghua Zheng, Postdoctoral Fellow (CISL/CGD)
16 | * Will Chapman, Postdoctoral Fellow (CGD/CISL/RAL)
17 | * Mariana Cains, Postdoctoral Fellow (MMM)
18 | * Chris Wirz, Postdoctoral Fellow (MMM)
19 | * Keely Lawrence, Student Assistant III (CISL, University of Colorado)
20 | * Prahalath Bharathi, Student Assistant (CISL, Tufts University)
21 | 
22 | # Sources
23 | Each blog published here is a Jupyter notebook, and you can download them from the [aiml-utils](https://github.com/NCAR/aiml-utils/tree/master/blog/site) repository hosted on the [NCAR GitHub group](https://github.com/NCAR) page.
24 | 
25 | Please head over to the [How To](https://ncar.github.io/aiml-utils/howto.html) page for instructions on how to add a blog to this website.
26 | 
27 | # License
28 | This work is licensed under a GNU General Public License v3.0
29 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/samplers.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import sys
5 | import optuna
6 | import logging
7 | 
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | supported_samplers = [
13 |     "TPESampler",
14 |     "GridSampler",
15 |     "RandomSampler",
16 |     "CmaEsSampler",
17 |     "IntersectionSearchSpace",
18 |     "MOTPEMultiObjectiveSampler",
19 |     "NSGAIIMultiObjectiveSampler",
20 |     "RandomMultiObjectiveSampler"
21 | ]
22 | 
23 | 
24 | def samplers(sampler):
25 |     _type = sampler.pop("type")
26 |     if _type not in supported_samplers:
27 |         message = f"Sampler {_type} is not valid. Select from {supported_samplers}"
28 |         logger.warning(message)
29 |         raise OSError(message)
30 |     if _type == "TPESampler":
31 |         return optuna.samplers.TPESampler(**sampler)
32 |     elif _type == "GridSampler":
33 |         if "search_space" not in sampler:
34 |             raise OSError("You must provide search_space options with the GridSampler.")
35 |         else:
36 |             return optuna.samplers.GridSampler(**sampler)
37 |     elif _type == "RandomSampler":
38 |         return optuna.samplers.RandomSampler(**sampler)
39 |     elif _type == "CmaEsSampler":
40 |         return optuna.samplers.CmaEsSampler(**sampler)
41 |     elif _type == "IntersectionSearchSpace":
42 |         return optuna.samplers.IntersectionSearchSpace(**sampler)
43 |     # support for multi-objective studies
44 |     elif _type == "MOTPEMultiObjectiveSampler":
45 |         return optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler(**sampler)
46 |     elif _type == "NSGAIIMultiObjectiveSampler":
47 |         return optuna.multi_objective.samplers.NSGAIIMultiObjectiveSampler(**sampler)
48 |     elif _type == "RandomMultiObjectiveSampler":
49 |         return optuna.multi_objective.samplers.RandomMultiObjectiveSampler(**sampler)
--------------------------------------------------------------------------------
/blog/site/howto.md:
--------------------------------------------------------------------------------
1 | # Add a jupyter notebook as a blog
2 | 
3 | 0. Install [aiml-utils](https://github.com/NCAR/aiml-utils/tree/master/aimlutils), or if already installed do a `git pull` with master before proceeding. Then switch to your branch (or create a new branch).
4 | 
5 | 1. Write your notebook, which we will assume is called `test_blog.ipynb`, with comments in Markdown. Be sure to check the header number of the most recently published blog, and set yours to be one greater (for example, the very first line in the callbacks blog has the title: 3. Callbacks: Utilities for interacting with ML training). If your blog comes next, make sure the first line is (in Markdown): `# 4. Your blog's title`.
6 | 
7 | 2. Save your notebook in the directory `blog/site`. If any additional data or files accompany your blog, save those details in a separate file in `blog/site`.
8 | 
9 | 3. Add your blog as the last entry to the registry `blog/site/_toc.yml`. For example: `- file: test_blog.ipynb`
10 | 
11 | 4. Rebuild the blog website. First change directory to "aiml-utils/blog", then type: `jupyter-book build site` (you must have jupyter-book, as well as ghp-import, installed via pip; the full command sequence is consolidated below). Once the site is rebuilt, it will supply you with a local address to view the latest (local) build of the website. When you are happy with your entry, commit the blog to your branch of aiml-utils and issue a pull request.
12 | 
13 | 5. When the pull request is approved and merged, publish the blog by first changing to the aiml-utils/blog directory. Next, execute the following command (which will ask for your github username and password): `ghp-import -n -p -f site/_build/html`
14 | 
15 | 6. Check the [updated website](https://ncar.github.io/aiml-utils/home.html) for any mistakes or errors.
16 | 
17 | Please direct any questions or comments to John Schreck (schreck@ucar.edu) or David John Gagne (dgagne@ucar.edu).
18 | 
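19 | For convenience, steps 4 and 5 boil down to the command sequence below, mirroring `blog/build.sh` and `blog/publish.sh` in this repository (run from the `aiml-utils/blog` directory):
20 | 
21 | ```bash
22 | # Clean out any previous build, then rebuild the site with jupyter-book
23 | jupyter-book clean site/_build
24 | jupyter-book build site
25 | # Publish the freshly built HTML to GitHub Pages
26 | ghp-import -n -p -f site/_build/html
27 | ```
28 | 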
29 | # License
30 | This work is licensed under a GNU General Public License v3.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | '''
2 | Setup file for the necessary packages to build and install aimlutils.
3 | For ease, this can be installed from GitHub via:
4 | pip3 install git+https://github.com/NCAR/aiml-utils.git
5 | 
6 | It is recommended that you install this into a Python or Conda Virtual Environment.
7 | '''
8 | 
9 | import codecs
10 | import os
11 | import re
12 | from setuptools import setup, find_packages
13 | 
14 | here = os.path.abspath(os.path.dirname(__file__))
15 | 
16 | def read(*parts):
17 |     with codecs.open(os.path.join(here, *parts), 'r') as fp:
18 |         return fp.read()
19 | 
20 | def find_version(*file_paths):
21 |     version_file = read(*file_paths)
22 |     version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
23 |                               version_file, re.M)
24 |     if version_match:
25 |         return version_match.group(1)
26 |     raise RuntimeError("Unable to find version string.")
27 | 
28 | 
29 | 
30 | with open("README.md") as f:
31 |     long_description = f.read()
32 | 
33 | 
34 | with open("requirements.txt") as f:
35 |     required_libraries = f.read().splitlines()
36 | 
37 | 
38 | setup(
39 |     name='aimlutils',
40 |     version=find_version("./", "__version__.py"),
41 |     author='AIML',
42 |     description=('This repository contains various pieces of code that are shared across different AIML projects, as well as notebooks for blogs'),
43 |     long_description=long_description,
44 |     long_description_content_type='text/markdown',
45 |     url='https://github.com/NCAR/aiml-utils',
46 |     classifiers=[
47 |         "Intended Audience :: Developers",
48 |         "Natural Language :: English",
49 |         "Programming Language :: Python :: 3.7",
50 |     ],
51 |     keywords="",
52 |     install_requires=required_libraries,
53 |     packages=find_packages(exclude=['aimlutils/tests']),
54 |     # test_suite='tests',
55 |     zip_safe=False,
56 | )
57 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/torch/hyperparameter.yml:
--------------------------------------------------------------------------------
1 | log:
2 |   save_path: "examples/torch/test/log.txt"
3 | 
4 | slurm:
5 |   jobs: 1
6 |   batch:
7 |     account: "NAML0001"
8 |     gres: "gpu:v100:1"
9 |     mem: "128G"
10 |     n: 8
11 |     t: "12:00:00"
12 |     J: "hyper_opt"
13 |     o: "hyper_opt.out"
14 |     e: "hyper_opt.err"
15 | 
16 | optuna:
17 |   name: "holodec_optimization.db"
18 |   reload: 0
19 |   objective: "/glade/work/schreck/repos/aiml-utils/aimlutils/hyper_opt/examples/torch/objective.py"
20 |   direction: "minimize"
21 |   metric: "val_loss"
22 |   n_trials: 1
23 |   gpu: True
24 |   save_path: 'examples/torch/test'
25 |   sampler:
26 |     type: "TPESampler"
27 |   parameters:
28 |     num_dense:
29 |       type: "int"
30 |       settings:
31 |         name: "num_dense"
32 |         low: 0
33 |         high: 10
34 |     dense_hidden_dim1:
35 |       type: "int"
36 |       settings:
37 |         name: "dense_hidden_dim1"
38 |         low: 10
39 |         high: 10000
40 |     dense_hidden_dim2:
41 |       type: "int"
42 |       settings:
43 |         name: "dense_hidden_dim2"
44 |         low: 10
45 |         high: 5000
46 |     dr1:
47 |       type: "float"
48 |       settings:
49 |         name: "dr1"
50 |         low: 0.0
51 |         high: 0.5
52 |     dr2:
53 |       type: "float"
54 |       settings:
55 |         name: "dr2"
56 |         low: 0.0
57 |         high: 0.5
58 |     trainer:alpha:
59 |       type: "float"
60 |       settings:
61 |         name: "alpha"
62 |         low: 0.001
63 |         high: 1.0
64 |     trainer:beta:
65 |       type: "float"
66 |       settings:
67 |         name: "beta"
68 |         low: 0.001
69 |         high: 1.0
70 |     optimizer:lr:
71 |       type: "loguniform"
72 |       settings:
73 |         name: "lr"
74 |         low: 0.0000001
75 |         high: 0.01
76 |     optimizer:weight_decay:
77 |       type: "loguniform"
78 |       settings:
79 |         name: "weight_decay"
80 |         low: 0.00000001
81 |         high: 0.1
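82 | 
83 | # Note on parameter names (see aimlutils/echo/src/base_objective.py): a key
84 | # containing ":" (e.g. "optimizer:lr") is split on the colon and used to update
85 | # the matching *nested* fields of the model configuration via recursive_update,
86 | # while a plain key (e.g. "num_dense") must match a top-level config field.
87 | # The "name" under settings is only the label optuna records for the trial.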
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/hyperparameter.yml:
--------------------------------------------------------------------------------
1 | log:
2 |   save_path: "examples/keras/results/log.txt"
3 | 
4 | slurm:
5 |   jobs: 1
6 |   batch:
7 |     account: "NAML0001"
8 |     gres: "gpu:v100:1"
9 |     mem: "128G"
10 |     n: 8
11 |     t: "12:00:00"
12 |     J: "hyper_opt"
13 |     o: "hyper_opt.out"
14 |     e: "hyper_opt.err"
15 | 
16 | optuna:
17 |   name: "holodec_optimization.db"
18 |   reload: 1
19 |   objective: "examples/keras/objective.py"
20 |   direction: "minimize"
21 |   metric: "val_loss"
22 |   n_trials: 500
23 |   gpu: True
24 |   save_path: 'examples/keras/results'
25 |   sampler:
26 |     type: "TPESampler"
27 |   parameters:
28 |     conv2d_network:lr:
29 |       type: "loguniform"
30 |       settings:
31 |         name: "lr"
32 |         low: 0.0000001
33 |         high: 0.01
34 |     filter1:
35 |       type: "int"
36 |       settings:
37 |         name: "filter1"
38 |         low: 1
39 |         high: 64
40 |     filter2:
41 |       type: "int"
42 |       settings:
43 |         name: "filter2"
44 |         low: 1
45 |         high: 64
46 |     filter3:
47 |       type: "int"
48 |       settings:
49 |         name: "filter3"
50 |         low: 1
51 |         high: 64
52 |     kernel1:
53 |       type: "int"
54 |       settings:
55 |         name: "kernel1"
56 |         low: 1
57 |         high: 10
58 |     kernel2:
59 |       type: "int"
60 |       settings:
61 |         name: "kernel2"
62 |         low: 1
63 |         high: 10
64 |     kernel3:
65 |       type: "int"
66 |       settings:
67 |         name: "kernel3"
68 |         low: 1
69 |         high: 10
70 |     pool1:
71 |       type: "int"
72 |       settings:
73 |         name: "pool1"
74 |         low: 1
75 |         high: 50
76 |     pool2:
77 |       type: "int"
78 |       settings:
79 |         name: "pool2"
80 |         low: 1
81 |         high: 50
82 |     pool3:
83 |       type: "int"
84 |       settings:
85 |         name: "pool3"
86 |         low: 1
87 |         high: 50
88 |     dense1:
89 |       type: "int"
90 |       settings:
91 |         name: "dense1"
92 |         low: 10
93 |         high: 10000
94 |     dense2:
95 |       type: "int"
96 |       settings:
97 |         name: "dense2"
98 |         low: 10
99 |         high: 5000
100 | 
--------------------------------------------------------------------------------
/aimlutils/torch/losses/losses.py:
--------------------------------------------------------------------------------
1 | import torch, logging
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | 
5 | 
6 | logger = logging.getLogger(__name__)
7 | 
8 | ###################
9 | #
10 | # Entropy Losses
11 | #
12 | ###################
13 | 
14 | def loss_fn(recon_x, x, mu, logvar):
15 |     criterion = nn.BCELoss(reduction='sum')
16 |     BCE = criterion(recon_x, x)
17 |     KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
18 |     return BCE + KLD, BCE, KLD
19 | 
20 | 
21 | class SymmetricCE:
22 | 
23 |     def __init__(self, alpha, gamma, kld_weight = 1.0):
24 |         self.alpha = alpha
25 |         self.gamma = gamma
26 |         self.kld_weight = kld_weight
27 | 
28 |         logger.info(f"Loaded Symmetric Cross Entropy loss ...")
29 |         logger.info(f"... with alpha = {alpha}, gamma = {gamma}, and kld_weight = {kld_weight}")
30 | 
31 |     def __call__(self, recon_x, x, mu, logvar):
32 |         criterion = nn.BCELoss(reduction='sum')
33 |         BCE = criterion(recon_x, x)
34 |         #KLD = torch.mean(-0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp(), dim = 1), dim = 0)
35 |         KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
36 |         return self.alpha * BCE + self.kld_weight * self.gamma * KLD, BCE, KLD
37 | 
38 | class SymmetricMSE:
39 | 
40 |     def __init__(self, alpha, gamma, kld_weight = 1.0):
41 |         self.alpha = alpha
42 |         self.gamma = gamma
43 |         self.kld_weight = kld_weight
44 | 
45 |         logger.info(f"Loaded Symmetric MSE loss ...")
46 |         logger.info(f"... 
with alpha = {alpha}, gamma = {gamma}, and kld_weight = {kld_weight}") 47 | 48 | def __call__(self, recon_x, x, mu, logvar): 49 | criterion = nn.MSELoss(reduction='sum') 50 | BCE = criterion(recon_x, x) 51 | #KLD = torch.mean(-0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp(), dim = 1), dim = 0) 52 | KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) 53 | return self.alpha * BCE + self.kld_weight * self.gamma * KLD, BCE, KLD 54 | 55 | 56 | ################### 57 | # 58 | # Regression Losses - https://github.com/tuantle/regression-losses-pytorch 59 | # 60 | ################### 61 | 62 | class LogCoshLoss(torch.nn.Module): 63 | def __init__(self): 64 | super().__init__() 65 | 66 | def forward(self, y_t, y_prime_t): 67 | ey_t = y_t - y_prime_t 68 | return torch.mean(torch.log(torch.cosh(ey_t + 1e-12))) 69 | 70 | 71 | class XTanhLoss(torch.nn.Module): 72 | def __init__(self): 73 | super().__init__() 74 | 75 | def forward(self, y_t, y_prime_t): 76 | ey_t = y_t - y_prime_t 77 | return torch.mean(ey_t * torch.tanh(ey_t)) 78 | 79 | 80 | class XSigmoidLoss(torch.nn.Module): 81 | def __init__(self): 82 | super().__init__() 83 | 84 | def forward(self, y_t, y_prime_t): 85 | ey_t = y_t - y_prime_t 86 | return torch.mean(2 * ey_t / (1 + torch.exp(-ey_t)) - ey_t) -------------------------------------------------------------------------------- /blog/site/_config.yml: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # Book settings 3 | title: "Analytics and Integrative Machine Learning" 4 | author: AIML @ NCAR 5 | logo: ../NCAR_UCAR_LIVERY/Logos/Contemporary Logos/NCAR/Dark Logo/NCAR-contemp-logo-blue.png 6 | exclude_patterns : [_build, Thumbs.db, .DS_Store, "**.ipynb_checkpoints"] 7 | 8 | 9 | ####################################################################################### 10 | # Execution settings 11 | execute: 12 | execute_notebooks : off # Whether to execute notebooks at build time. Must be one of ("auto", "force", "cache", "off") 13 | cache : "" # A path to the jupyter cache that will be used to store execution artifacts. Defaults to `_build/.jupyter_cache/` 14 | exclude_patterns : [] # A list of patterns to *skip* in execution (e.g. a notebook that takes a really long time) 15 | timeout : 30 # The maximum time (in seconds) each notebook cell is allowed to run. 16 | run_in_temp : false # If `True`, then a temporary directory will be created and used as the command working directory (cwd), 17 | # otherwise the notebook's parent directory will be the cwd. 18 | allow_errors : false # If `False`, when a code cell raises an error the execution is stopped, otherwise all cells are always run. 19 | stderr_output : show # One of 'show', 'remove', 'remove-warn', 'warn', 'error', 'severe' 20 | 21 | ####################################################################################### 22 | # Parse and render settings 23 | parse: 24 | myst_enable_extensions: # default extensions to enable in the myst parser. 
See https://myst-parser.readthedocs.io/en/latest/using/syntax-optional.html 25 | # - amsmath 26 | - colon_fence 27 | # - deflist 28 | - dollarmath 29 | # - html_admonition 30 | # - html_image 31 | - linkify 32 | # - replacements 33 | # - smartquotes 34 | - substitution 35 | 36 | myst_url_schemes : [mailto, http, https] # URI schemes that will be recognised as external URLs in Markdown links 37 | 38 | ####################################################################################### 39 | # HTML-specific settings 40 | html: 41 | favicon : "" # A path to a favicon image 42 | use_edit_page_button : false # Whether to add an "edit this page" button to pages. If `true`, repository information in repository: must be filled in 43 | use_repository_button : false # Whether to add a link to your repository button 44 | use_issues_button : false # Whether to add an "open an issue" button 45 | extra_navbar : Powered by Jupyter Book # Will be displayed underneath the left navbar. 46 | extra_footer : "" # Will be displayed underneath the footer. 47 | google_analytics_id : "" # A GA id that can be used to track book views. 48 | home_page_in_navbar : true # Whether to include your home page in the left Navigation Bar 49 | baseurl : "" # The base URL where your book will be hosted. Used for creating image previews and social links. e.g.: https://mypage.com/mybook/ 50 | comments: 51 | hypothesis : false 52 | utterances : false 53 | 54 | ####################################################################################### 55 | # LaTeX-specific settings 56 | latex: 57 | latex_engine : pdflatex # one of 'pdflatex', 'xelatex' (recommended for unicode), 'luatex', 'platex', 'uplatex' 58 | use_jupyterbook_latex : true # use jupyterbook-latex for pdf builds as default 59 | 60 | ####################################################################################### 61 | # Launch button settings 62 | launch_buttons: 63 | notebook_interface : jupyterlab # The interface interactive links will activate ["classic", "jupyterlab"] 64 | binderhub_url : https://mybinder.org # The URL of the BinderHub (e.g., https://mybinder.org) 65 | jupyterhub_url : "" # The URL of the JupyterHub (e.g., https://datahub.berkeley.edu) 66 | thebe : false # Add a thebe button to pages (requires the repository to run on Binder) 67 | colab_url : "" # The URL of Google Colab (https://colab.research.google.com) 68 | 69 | repository: 70 | url : https://github.com/NCAR/aiml-utils # The URL to your book's repository 71 | path_to_book : blog/site # A path to your book's folder, relative to the repository root. 
72 |   branch            : master   # Which branch of the repository should be used when creating links
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/objective.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import copy
5 | import optuna
6 | import logging
7 | import traceback
8 | 
9 | from aimlutils.echo.src.base_objective import *
10 | from .data_generator import DataGenerator
11 | from model import Conv2DNeuralNetwork
12 | 
13 | from holodecml.callbacks import get_callbacks
14 | from aimlutils.echo.src.pruners import KerasPruningCallback
15 | 
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | def custom_updates(trial, conf):
21 | 
22 |     # Get list of hyperparameters from the config
23 |     hyperparameters = conf["optuna"]["parameters"]
24 | 
25 |     # Now update some via custom rules
26 |     filter1 = trial.suggest_int(**hyperparameters["filter1"]["settings"])
27 |     filter2 = trial.suggest_int(**hyperparameters["filter2"]["settings"])
28 |     filter3 = trial.suggest_int(**hyperparameters["filter3"]["settings"])
29 |     kernel1 = trial.suggest_int(**hyperparameters["kernel1"]["settings"])
30 |     kernel2 = trial.suggest_int(**hyperparameters["kernel2"]["settings"])
31 |     kernel3 = trial.suggest_int(**hyperparameters["kernel3"]["settings"])
32 |     pool1 = trial.suggest_int(**hyperparameters["pool1"]["settings"])
33 |     pool2 = trial.suggest_int(**hyperparameters["pool2"]["settings"])
34 |     pool3 = trial.suggest_int(**hyperparameters["pool3"]["settings"])
35 |     dense1 = trial.suggest_int(**hyperparameters["dense1"]["settings"])
36 |     dense2 = trial.suggest_int(**hyperparameters["dense2"]["settings"])
37 | 
38 |     conf["conv2d_network"]["filters"] = [filter1, filter2, filter3]
39 |     conf["conv2d_network"]["kernel_sizes"] = [kernel1, kernel2, kernel3]
40 |     conf["conv2d_network"]["pool_sizes"] = [pool1, pool2, pool3]
41 |     conf["conv2d_network"]["dense_sizes"] = [dense1, dense2]
42 | 
43 |     return conf
44 | 
45 | 
46 | class Objective(BaseObjective):
47 | 
48 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
49 | 
50 |         # Initialize the base class (note that BaseObjective does not take a study argument)
51 |         BaseObjective.__init__(self, config, metric, device)
52 | 
53 | 
54 |     def train(self, trial, conf):
55 | 
56 |         # Custom updates
57 |         conf = custom_updates(trial, conf)
58 | 
59 |         # Set up some globals
60 |         path_data = conf["path_data"]
61 |         num_particles = conf["num_particles"]
62 |         split = 'train'
63 |         subset = False
64 |         output_cols = ["x", "y", "z", "d", "hid"]
65 | 
66 |         input_shape = (600, 400, 1)
67 |         batch_size = conf["conv2d_network"]["batch_size"]
68 |         n_particles = conf["num_particles"]
69 |         output_channels = len(output_cols) - 1
70 | 
71 |         # Load the data
72 |         train_gen = DataGenerator(
73 |             path_data, num_particles, "train", subset,
74 |             output_cols, batch_size, maxnum_particles = 3, shuffle = False
75 |         )
76 |         train_scalers = train_gen.get_transform()
77 |         valid_gen = DataGenerator(
78 |             path_data, num_particles, "test", subset,
79 |             output_cols, batch_size, scaler = train_scalers, maxnum_particles = 3, shuffle = False
80 |         )
81 | 
82 |         # Load the model
83 |         model = Conv2DNeuralNetwork(**conf["conv2d_network"])
84 |         model.build_neural_network(input_shape, n_particles, output_channels)
85 | 
86 |         # Load callbacks
87 |         callbacks = get_callbacks(conf["callbacks"])
88 | 
89 |         # Load optuna keras pruning callback
90 |         pruning_callback = KerasPruningCallback(trial, self.metric)
91 |         callbacks.append(pruning_callback)
92 | 
93 |         # Train a model
94 |         try:  # Aim to catch instances when the GPU memory overflows
95 |             blackbox = model.model.fit(
96 |                 train_gen,
97 |                 validation_data=valid_gen,
98 |                 epochs=conf["conv2d_network"]["epochs"],
99 |                 verbose=True,
100 |                 callbacks=callbacks,
101 |                 use_multiprocessing=True,
102 |                 workers=8,
103 |                 max_queue_size=100
104 |             )
105 |         except Exception:  # When that happens, let optuna consider it as a pruned trial
106 |             raise optuna.TrialPruned()
107 | 
108 |         if trial.should_prune():
109 |             raise optuna.TrialPruned()
110 | 
111 |         # Return the validation accuracy for the last epoch.
112 |         objective = blackbox.history[self.metric][-1]
113 | 
114 |         results_dictionary = {
115 |             self.metric: objective
116 |         }
117 | 
118 |         return results_dictionary
119 | 
--------------------------------------------------------------------------------
/aimlutils/echo/src/base_objective.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | from aimlutils.echo.src.trial_suggest import trial_suggest_loader
5 | from collections import defaultdict
6 | import copy, os, sys, random
7 | import pandas as pd
8 | import logging
9 | import optuna
10 | 
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | def recursive_update(nested_keys, dictionary, update):
16 |     if isinstance(dictionary, dict) and len(nested_keys) > 1:
17 |         recursive_update(nested_keys[1:], dictionary[nested_keys[0]], update)
18 |     else:
19 |         dictionary[nested_keys[0]] = update
20 | 
21 | 
22 | class BaseObjective:
23 | 
24 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
25 | 
26 |         self.config = config
27 |         self.metric = metric
28 |         self.device = f"cuda:{device}" if device != "cpu" else "cpu"
29 | 
30 |         self.results = defaultdict(list)
31 |         save_path = config["optuna"]["save_path"]
32 |         self.results_fn = os.path.join(save_path, f"hyper_opt_{random.randint(0, 100000)}.csv")
33 |         while os.path.isfile(self.results_fn):
34 |             rand_index = random.randint(0, 100000)
35 |             self.results_fn = os.path.join(save_path, f"hyper_opt_{rand_index}.csv")
36 | 
37 |         logger.info(f"Initialized an objective to be optimized with metric {metric}")
38 |         logger.info(f"Using device {device}")
39 |         logger.info(f"Saving study/trial results to local file {self.results_fn}")
40 | 
41 |     def update_config(self, trial):
42 | 
43 |         logger.info(
44 |             f"Attempting to automatically update the model configuration using optuna's suggested parameters"
45 |         )
46 | 
47 |         # Make a copy of the config that we can edit
48 |         conf = copy.deepcopy(self.config)
49 | 
50 |         # Update the fields that can be matched automatically (through the name field)
51 |         updated = []
52 |         hyperparameters = conf["optuna"]["parameters"]
53 |         for named_parameter, update in hyperparameters.items():
54 |             if ":" in named_parameter:
55 |                 recursive_update(
56 |                     named_parameter.split(":"),
57 |                     conf,
58 |                     trial_suggest_loader(trial, update))
59 |                 updated.append(named_parameter)
60 |             else:
61 |                 if named_parameter in conf:
62 |                     conf[named_parameter] = trial_suggest_loader(trial, update)
63 |                     updated.append(named_parameter)
64 | 
65 |         logger.info(f"Those that got updated automatically: {updated}")
66 |         return conf
67 | 
68 |     # Deprecated as of the writing of the report.py script
69 | 
70 |     def save(self, trial, results_dict):
71 | 
72 |         # Make sure the relevant metric was placed into the results dictionary
73 |         single_objective = isinstance(self.metric, str)
74 |         if single_objective:
75 |             if self.metric not in results_dict:
76 |                 raise OSError(
77 |                     "You must return the metric result to the hyperparameter optimizer"
78 |                 )
79 |         else:
80 |             for metric in self.metric:
81 |                 if metric not in results_dict:
82 |                     raise OSError(
83 |                         "You must return the metric result to the hyperparameter optimizer"
84 |                     )
85 | 
86 |         # Save the hyperparameters used in the trial
87 |         self.results["trial"].append(trial.number)
88 |         for param, value in trial.params.items():
89 |             self.results[param].append(value)
90 | 
91 |         # Save the metric and "other metrics"
92 |         for metric, value in results_dict.items():
93 |             self.results[metric].append(value)
94 | 
95 |         # Save pruning boolean (append so every column stays list-valued)
96 |         self.results["pruned"].append(int(trial.should_prune()))
97 |         #self.results["complete"] = int(trial.state == optuna.trial.TrialState.COMPLETE)
98 | 
99 |         # Save the df of results to disk
100 |         pd.DataFrame.from_dict(self.results).to_csv(self.results_fn)
101 | 
102 |         logger.info(
103 |             f"Saving trial {trial.number} results to local file {self.results_fn}"
104 |         )
105 | 
106 |         if single_objective:
107 |             return results_dict[self.metric]
108 |         else:
109 |             return [results_dict[metric] for metric in self.metric]
110 | 
111 |     def __call__(self, trial):
112 | 
113 |         # Automatically update the config, when possible
114 |         conf = self.update_config(trial)
115 | 
116 |         # Train the model
117 |         logger.info(
118 |             f"Beginning to train the model using the latest parameters from optuna"
119 |         )
120 | 
121 |         result = self.train(trial, conf)
122 | 
123 |         return self.save(trial, result)
124 | 
125 |     def train(self, trial, conf):
126 |         raise NotImplementedError
--------------------------------------------------------------------------------
/aimlutils/torch/checkpoint/checkpointer.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import List, Dict
3 | import pandas as pd
4 | import numpy as np
5 | import logging
6 | import torch
7 | import time
8 | import math
9 | import glob
10 | import os
11 | 
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def load_checkpoint(checkpoint_path: str):
17 |     # It's weird that if `map_location` is not given, it will be extremely slow.
18 |     return torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
19 | 
20 | 
21 | class EarlyStopping:
22 |     """Early stops the training if validation loss doesn't improve after a given patience."""
23 | 
24 |     def __init__(self,
25 |                  patience=7,
26 |                  verbose=False,
27 |                  delta=0,
28 |                  save_every_epoch=False,
29 |                  path_save='checkpoint.pt',
30 |                  tag = None):
31 |         """
32 |         Args:
33 |             patience (int): How long to wait after last time validation loss improved.
34 |                             Default: 7
35 |             verbose (bool): If True, prints a message for each validation loss improvement.
36 |                             Default: False
37 |             delta (float): Minimum change in the monitored quantity to qualify as an improvement.
38 |                            Default: 0
39 |             path_save (str): Path for the checkpoint to be saved to.
40 |                              Default: 'checkpoint.pt'
41 |             save_every_epoch (bool): If True, also save a (non-best) checkpoint at every epoch.
42 |             tag (str): Optional suffix for the saved checkpoint filenames.
43 |         """
44 |         self.patience = patience
45 |         self.verbose = verbose
46 |         self.counter = 0
47 |         self.best_score = None
48 |         self.early_stop = False
49 |         self.val_loss_min = np.Inf
50 |         self.delta = delta
51 |         self.path = path_save
52 |         self.dirpath = os.path.dirname(self.path)
53 |         self.save_every_epoch = save_every_epoch
54 |         self.tag = tag
55 | 
56 |         logger.info(
57 |             f"Loaded EarlyStopping checkpointer with patience {self.patience}")
58 | 
59 |     def __call__(self, epoch, val_loss, model, optimizer):
60 | 
61 |         score = val_loss
62 | 
63 |         if self.best_score is None:
64 |             self.best_score = score
65 |             self.save_checkpoint(epoch, val_loss, model, optimizer, best=True)
66 |         elif score < (self.best_score - self.delta):
67 |             self.best_score = score
68 |             self.save_checkpoint(epoch, val_loss, model, optimizer, best=True)
69 |             self.counter = 0
70 |         else:
71 |             self.counter += 1
72 |             logger.info(
73 |                 f'EarlyStopping counter: {self.counter} out of {self.patience}')
74 |             if self.save_every_epoch:
75 |                 self.save_checkpoint(
76 |                     epoch, val_loss, model, optimizer, best=False)
77 |             if self.counter >= self.patience:
78 |                 self.early_stop = True
79 | 
80 |     def save_checkpoint(self, epoch, val_loss, model, optimizer, best=False):
81 |         '''Saves model when validation loss decreases.'''
82 |         if best:
83 |             logger.info(
84 |                 f'Validation loss decreased on epoch {epoch} ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model.'
85 |             )
86 |         checkpoint = {
87 |             'epoch': epoch,
88 |             'val_loss': val_loss,
89 |             'model_state_dict': model.state_dict(),
90 |             'optimizer_state_dict': optimizer.state_dict(),
91 |             'lr': self.print_learning_rate(optimizer)
92 |         }
93 |         if not best:  # save a model, not the best one seen so far
94 |             if self.tag is not None:
95 |                 save_path = os.path.join(self.dirpath, f"checkpoint_{self.tag}.pt")
96 |             else:
97 |                 save_path = os.path.join(self.dirpath, "checkpoint.pt")
98 |             torch.save(checkpoint, save_path)
99 |         else:  # save best model so far
100 |             if self.tag is not None:
101 |                 save_path = os.path.join(self.dirpath, f"best_{self.tag}.pt")
102 |             else:
103 |                 save_path = os.path.join(self.dirpath, "best.pt")
104 |             torch.save(checkpoint, save_path)
105 |             # Only track the minimum (best) validation loss seen so far
106 |             self.val_loss_min = val_loss
107 | 
108 |     def print_learning_rate(self, optimizer):
109 |         for param_group in optimizer.param_groups:
110 |             return param_group["lr"]
111 | 
112 | 
113 | class MetricsLogger:
114 | 
115 |     def __init__(self, path_save: str, reload: bool = False) -> None:
116 | 
117 |         self.path_save = os.path.join(f"{path_save}", "training_log.csv")
118 | 
119 |         if reload:
120 |             self.load()
121 |             logger.info(
122 |                 f"Loaded a previous metrics file from {self.path_save}")
123 |         else:
124 |             self.metrics = defaultdict(list)
125 |             logger.info(
126 |                 f"Loaded a metrics logger {self.path_save} to track the training results")
127 | 
128 |     def update(self, data: Dict[str, float]) -> None:
129 |         for key, value in data.items():
130 |             self.metrics[key].append(value)
131 |         self.save()
132 | 
133 |     def to_pandas(self) -> pd.DataFrame:
134 |         return pd.DataFrame.from_dict(self.metrics)
135 | 
136 |     def save(self) -> None:
137 |         self.to_pandas().to_csv(
138 |             self.path_save,
139 |             sep=',',
140 |             encoding='utf-8',
141 |             index=None
142 |         )
143 | 
144 |     def load(self) -> None:
145 |         # Read columns back as lists so update() can keep appending to them
146 |         self.metrics = defaultdict(list, pd.read_csv(
147 |             self.path_save,
148 |             sep=',',
149 |             encoding='utf-8'
150 |         ).to_dict(orient="list"))
151 | 
--------------------------------------------------------------------------------
/aimlutils/echo/examples/keras/data_generator.py:
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import math 5 | import time 6 | import random 7 | import traceback 8 | import xarray as xr 9 | import numpy as np 10 | import pandas as pd 11 | from datetime import datetime 12 | 13 | import matplotlib.pyplot as plt 14 | import scipy.sparse 15 | from scipy.ndimage import gaussian_filter 16 | 17 | from tqdm.auto import tqdm 18 | 19 | import numpy.fft as FFT 20 | from typing import List, Dict 21 | 22 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 23 | from tensorflow.keras.layers import (Input, Conv2D, Dense, Flatten, 24 | MaxPool2D, RepeatVector, Lambda, 25 | LeakyReLU, Dropout) 26 | from tensorflow.keras.models import Model, save_model 27 | from tensorflow.keras.optimizers import Adam, SGD 28 | import tensorflow.keras.backend as K 29 | 30 | from keras_radam import RAdam 31 | from keras_radam.training import RAdamOptimizer 32 | 33 | import tensorflow as tf 34 | 35 | 36 | num_particles_dict = { 37 | 1: '1particle', 38 | 3: '3particle', 39 | 'multi': 'multiparticle', 40 | '50-100': '50-100'} 41 | 42 | split_dict = { 43 | 'train' : 'training', 44 | 'test' : 'test', 45 | 'valid': 'validation'} 46 | 47 | 48 | class DataGenerator(tf.keras.utils.Sequence): 49 | 'Generates data for Keras' 50 | def __init__( 51 | 52 | self, 53 | path_data: str, 54 | num_particles: int, 55 | split: str, 56 | subset: bool, 57 | output_cols: List[str], 58 | batch_size: int, 59 | shuffle: bool = True, 60 | maxnum_particles: int = False, 61 | scaler: Dict[str, str] = False) -> None: 62 | 63 | 'Initialization' 64 | self.ds = self.open_dataset(path_data, num_particles, split) 65 | self.batch_size = batch_size 66 | self.output_cols = [x for x in output_cols if x != 'hid'] 67 | self.subset = subset 68 | self.hologram_numbers = self.ds.hologram_number.values 69 | if shuffle: 70 | random.shuffle(self.hologram_numbers) 71 | self.num_particles = num_particles 72 | self.xsize = len(self.ds.xsize.values) 73 | self.ysize = len(self.ds.ysize.values) 74 | self.shuffle = shuffle 75 | self.maxnum_particles = maxnum_particles 76 | 77 | if not scaler: 78 | self.scaler = {col: StandardScaler() for col in output_cols} 79 | for col in output_cols: 80 | scale = self.ds[col].values 81 | self.scaler[col].fit(scale.reshape(scale.shape[-1], -1)) 82 | else: 83 | self.scaler = scaler 84 | 85 | def get_transform(self): 86 | return self.scaler 87 | 88 | def __len__(self): 89 | 'Denotes the number of batches per epoch' 90 | return math.ceil(len(self.hologram_numbers) / self.batch_size) 91 | 92 | def __getitem__(self, idx): 93 | 'Generate one batch of data' 94 | holograms = self.hologram_numbers[ 95 | idx * self.batch_size: (idx + 1) * self.batch_size 96 | ] 97 | x_out, y_out, w_out = self._batch(holograms) 98 | return x_out, y_out, w_out 99 | 100 | def on_epoch_end(self): 101 | 'Updates indexes after each epoch' 102 | if self.shuffle == True: 103 | random.shuffle(self.hologram_numbers) 104 | 105 | def _batch(self, holograms: List[int]): 106 | 'Create a batch of data' 107 | try: 108 | 109 | x_out = np.zeros(( 110 | len(holograms), self.xsize, self.ysize 111 | )) 112 | y_out = np.zeros(( 113 | len(holograms), 114 | self.maxnum_particles if self.maxnum_particles else self.num_particles, 115 | len(self.output_cols) 116 | )) 117 | # Move the scaler.transform to here 118 | 119 | a = time.time() 120 | for k, hologram in enumerate(holograms): 121 | im = 
self.ds["image"][hologram].values 122 | x_out[k] = (im-np.mean(im)) / (np.std(im)) 123 | #A = np.log(np.abs(OpticsFFT(A))) 124 | particles = np.where(self.ds["hid"] == hologram + 1)[0] 125 | for l, p in enumerate(particles): 126 | for m, col in enumerate(self.output_cols): 127 | val = self.ds[col][p].values 128 | y_out[k, l, m] = self.scaler[col].transform( 129 | val.reshape(1, -1) 130 | ) 131 | if self.maxnum_particles and len(particles) < self.maxnum_particles: 132 | for l in range(len(particles), self.maxnum_particles): 133 | for m, col in enumerate(self.output_cols): 134 | val = y_out[k, l, m] 135 | y_out[k, l, m] = self.scaler[col].transform( 136 | val.reshape(1, -1) 137 | ) 138 | # 139 | # convert y_out to sparse if we are using padding 140 | # if self.maxnum_particles: 141 | # y_out = sparse_vstack([ 142 | # csr_matrix(y_out[i]) for i in y_out.shape[0] 143 | # ]) 144 | 145 | x_out = np.expand_dims(x_out, axis=-1) 146 | return x_out, y_out, [None] #class weights option 147 | 148 | except: 149 | print(traceback.print_exc()) 150 | 151 | def open_dataset(self, path_data, num_particles, split): 152 | """ 153 | Opens a HOLODEC file 154 | 155 | Args: 156 | path_data: (str) Path to dataset directory 157 | num_particles: (int or str) Number of particles per hologram 158 | split: (str) Dataset split of either 'train', 'valid', or 'test' 159 | 160 | Returns: 161 | ds: (xarray Dataset) Opened dataset 162 | """ 163 | path_data = os.path.join(path_data, self.dataset_name(num_particles, split)) 164 | 165 | if not os.path.isfile(path_data): 166 | print(f"Data file does not exist at {path_data}. Exiting.") 167 | raise 168 | 169 | ds = xr.open_dataset(path_data) 170 | return ds 171 | 172 | def dataset_name(self, num_particles, split, file_extension='nc'): 173 | """ 174 | Return the dataset filename given user inputs 175 | 176 | Args: 177 | num_particles: (int or str) Number of particles per hologram 178 | split: (str) Dataset split of either 'train', 'valid', or 'test' 179 | file_extension: (str) Dataset file extension 180 | 181 | Returns: 182 | ds_name: (str) Dataset name 183 | """ 184 | 185 | valid = [1,3,'multi','50-100'] 186 | if num_particles not in valid: 187 | raise ValueError("results: num_particles must be one of %r." % valid) 188 | num_particles = num_particles_dict[num_particles] 189 | 190 | valid = ['train','test','valid'] 191 | if split not in valid: 192 | raise ValueError("results: split must be one of %r." 
% valid) 193 | split = split_dict[split] 194 | ds_name = f'synthetic_holograms_{num_particles}_{split}.{file_extension}' 195 | 196 | return ds_name -------------------------------------------------------------------------------- /aimlutils/torch/trainers/trainers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import tqdm 5 | import torch 6 | import pickle 7 | import logging 8 | 9 | from torchvision.utils import save_image 10 | from holodecml.vae.losses import * 11 | 12 | import numpy as np 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class BaseTrainer: 18 | 19 | def __init__(self, 20 | model, 21 | optimizer, 22 | train_gen, 23 | valid_gen, 24 | dataloader, 25 | valid_dataloader, 26 | start_epoch = 0, 27 | epochs = 100, 28 | device = "cpu", 29 | clip = 1.0, 30 | alpha = 1.0, 31 | beta = 1.0, 32 | kld_weight = [], 33 | path_save = "./", 34 | test_image = None): 35 | 36 | self.model = model 37 | self.optimizer = optimizer 38 | self.train_gen = train_gen 39 | self.valid_gen = valid_gen 40 | self.dataloader = dataloader 41 | self.valid_dataloader = valid_dataloader 42 | self.batch_size = dataloader.batch_size 43 | self.path_save = path_save 44 | self.device = device 45 | 46 | self.start_epoch = start_epoch 47 | self.epochs = epochs 48 | 49 | self.alpha = alpha 50 | self.beta = beta 51 | 52 | self.kld_weight = kld_weight 53 | if len(kld_weight) == 0: 54 | self.kld_weight = [ 55 | self.batch_size/self.train_gen.__len__(), 56 | self.batch_size/self.valid_gen.__len__() 57 | ] 58 | self.criterion_train = SymmetricMSE( 59 | self.alpha, self.beta, self.kld_weight[0] 60 | ) 61 | self.criterion_test = SymmetricMSE( 62 | self.alpha, self.beta, self.kld_weight[1] 63 | ) 64 | 65 | self.test_image = test_image 66 | 67 | # Gradient clipping through hook registration 68 | for p in self.model.parameters(): 69 | p.register_hook(lambda grad: torch.clamp(grad, -clip, clip)) 70 | logger.info(f"Clipping gradients to range [-{clip}, {clip}]") 71 | 72 | # Create the save directory if it does not exist 73 | try: 74 | os.makedirs(path_save) 75 | except: 76 | pass 77 | 78 | 79 | def train_one_epoch(self, epoch): 80 | 81 | self.model.train() 82 | batches_per_epoch = int(np.ceil(self.train_gen.__len__() / self.batch_size)) 83 | batch_group_generator = tqdm.tqdm( 84 | enumerate(self.dataloader), 85 | total=batches_per_epoch, 86 | leave=True 87 | ) 88 | 89 | epoch_losses = {"loss": [], "bce": [], "kld": []} 90 | for idx, images in batch_group_generator: 91 | 92 | images = images.to(self.device) 93 | recon_images, mu, logvar = self.model(images) 94 | loss, bce, kld = self.criterion_train(recon_images, images, mu, logvar) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | batch_loss = loss.item() #/ self.batch_size 101 | bce_loss = bce.item() #/ self.batch_size 102 | kld_loss = kld.item() #/ self.batch_size 103 | 104 | epoch_losses["loss"].append(batch_loss) 105 | epoch_losses["bce"].append(bce_loss) 106 | epoch_losses["kld"].append(kld_loss) 107 | 108 | loss = np.mean(epoch_losses["loss"]) 109 | bce = np.mean(epoch_losses["bce"]) 110 | kld = np.mean(epoch_losses["kld"]) 111 | 112 | to_print = "loss: {:.3f} bce: {:.3f} kld: {:.3f}".format(loss, bce, kld) 113 | batch_group_generator.set_description(to_print) 114 | batch_group_generator.update() 115 | 116 | return loss, bce, kld 117 | 118 | 119 | def test(self, epoch): 120 | 121 | self.model.eval() 122 | batches_per_epoch = 
int(np.ceil(self.valid_gen.__len__() / self.batch_size)) 123 | 124 | with torch.no_grad(): 125 | 126 | batch_group_generator = tqdm.tqdm( 127 | enumerate(self.valid_dataloader), 128 | total=batches_per_epoch, 129 | leave=True 130 | ) 131 | 132 | epoch_losses = {"loss": [], "bce": [], "kld": []} 133 | for idx, images in batch_group_generator: 134 | 135 | images = images.to(self.device) 136 | recon_images, mu, logvar = self.model(images) 137 | loss, bce, kld = self.criterion_test(recon_images, images, mu, logvar) 138 | 139 | batch_loss = loss.item() #/ self.batch_size 140 | bce_loss = bce.item() #/ self.batch_size 141 | kld_loss = kld.item() #/ self.batch_size 142 | 143 | epoch_losses["loss"].append(batch_loss) 144 | epoch_losses["bce"].append(bce_loss) 145 | epoch_losses["kld"].append(kld_loss) 146 | 147 | loss = np.mean(epoch_losses["loss"]) 148 | bce = np.mean(epoch_losses["bce"]) 149 | kld = np.mean(epoch_losses["kld"]) 150 | 151 | to_print = "val_loss: {:.3f} val_bce: {:.3f} val_kld: {:.3f}".format(loss, bce, kld) 152 | batch_group_generator.set_description(to_print) 153 | batch_group_generator.update() 154 | 155 | if os.path.isfile(self.test_image): 156 | with open(self.test_image, "rb") as fid: 157 | pic = pickle.load(fid) 158 | self.compare(epoch, pic) 159 | 160 | return loss, bce, kld 161 | 162 | 163 | def compare(self, epoch, x): 164 | x = x.to(self.device) 165 | recon_x, _, _ = self.model(x) 166 | compare_x = torch.cat([x, recon_x]) 167 | save_image(compare_x.data.cpu(), f'{self.path_save}/image_epoch_{epoch}.png') 168 | 169 | 170 | def train(self, 171 | scheduler, 172 | early_stopping, 173 | metrics_logger): 174 | 175 | logger.info( 176 | f"Training the model for up to {self.epochs} epochs starting at epoch {self.start_epoch}" 177 | ) 178 | 179 | flag = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) 180 | 181 | for epoch in range(self.start_epoch, self.epochs): 182 | 183 | train_loss, train_bce, train_kld = self.train_one_epoch(epoch) 184 | test_loss, test_bce, test_kld = self.test(epoch) 185 | 186 | scheduler.step(test_loss if flag else epoch) 187 | early_stopping(epoch, test_loss, self.model, self.optimizer) 188 | 189 | # Write results to the callback logger 190 | result = { 191 | "epoch": epoch, 192 | "train_loss": train_loss, 193 | "train_bce": train_bce, 194 | "train_kld": train_kld, 195 | "valid_loss": test_loss, 196 | "valid_bce": test_bce, 197 | "valid_kld": test_kld, 198 | "lr": early_stopping.print_learning_rate(self.optimizer) 199 | } 200 | metrics_logger.update(result) 201 | 202 | if early_stopping.early_stop: 203 | logger.info("Early stopping") 204 | break 205 | -------------------------------------------------------------------------------- /aimlutils/echo/examples/torch/objective.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import copy 5 | import optuna 6 | import logging 7 | import traceback 8 | 9 | from overrides import overrides 10 | from holodecml.vae.losses import * 11 | from holodecml.vae.visual import * 12 | from holodecml.vae.models import * 13 | from holodecml.vae.trainers import * 14 | from holodecml.vae.transforms import * 15 | from holodecml.vae.optimizers import * 16 | from holodecml.vae.data_loader import * 17 | from holodecml.vae.checkpointer import * 18 | from aimlutils.hyper_opt.base_objective import * 19 | 20 | from torch import nn 21 | from torch.optim.lr_scheduler import * 22 | from torch.utils.data import Dataset, 
DataLoader 23 | from typing import List, Dict, Callable, Union, Any, TypeVar, Tuple 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def custom_updates(trial, conf): 30 | 31 | # Get list of hyperparameters from the config 32 | hyperparameters = conf["optuna"]["parameters"] 33 | 34 | # Now update some via custom rules 35 | num_dense = trial_suggest_loader(trial, hyperparameters["num_dense"]) 36 | dense1 = trial_suggest_loader(trial, hyperparameters['dense_hidden_dim1']) 37 | dense2 = trial_suggest_loader(trial, hyperparameters['dense_hidden_dim2']) 38 | dr1 = trial_suggest_loader(trial, hyperparameters['dr1']) 39 | dr2 = trial_suggest_loader(trial, hyperparameters['dr2']) 40 | 41 | # Update the config based on optuna suggestions 42 | conf["model"]["dense_hidden_dims"] = [dense1] + [dense2 for k in range(num_dense)] 43 | conf["model"]["dense_dropouts"] = [dr1] + [dr2 for k in range(num_dense)] 44 | return conf 45 | 46 | 47 | class Objective(BaseObjective): 48 | 49 | def __init__(self, study, config, metric = "val_loss", device = "cpu"): 50 | 51 | BaseObjective.__init__(self, study, config, metric, device) 52 | 53 | if self.device != "cpu": 54 | torch.backends.cudnn.benchmark = True 55 | 56 | 57 | def train(self, trial, conf): 58 | 59 | ########################################################### 60 | # 61 | # Implement custom changes to config 62 | # 63 | ########################################################### 64 | 65 | conf = custom_updates(trial, conf) 66 | 67 | ########################################################### 68 | # 69 | # Load ML pipeline, train the model, and return the result 70 | # 71 | ########################################################### 72 | 73 | # Load custom option for the VAE/compressor models 74 | model_type = conf["type"] 75 | 76 | # Load image transformations. 
77 | transform = LoadTransformations(conf["transforms"], device = self.device) 78 | 79 | # Load dataset readers 80 | train_gen = LoadReader( 81 | reader_type = model_type, 82 | split = "train", 83 | transform = transform, 84 | scaler = None, 85 | config = conf["data"] 86 | ) 87 | 88 | valid_gen = LoadReader( 89 | reader_type = model_type, 90 | split = "test", 91 | transform = transform, 92 | scaler = train_gen.get_transform(), 93 | config = conf["data"], 94 | ) 95 | 96 | # Load data iterators from pytorch 97 | n_workers = conf['iterator']['num_workers'] 98 | 99 | #logging.info(f"Loading training data iterator using {n_workers} workers") 100 | 101 | dataloader = DataLoader( 102 | train_gen, 103 | **conf["iterator"] 104 | ) 105 | 106 | valid_dataloader = DataLoader( 107 | valid_gen, 108 | **conf["iterator"] 109 | ) 110 | 111 | # Load the model 112 | model = LoadModel(model_type, conf["model"], self.device) 113 | 114 | # Load the optimizer 115 | optimizer_config = conf["optimizer"] 116 | optimizer = LoadOptimizer( 117 | optimizer_config["type"], 118 | model.parameters(), 119 | optimizer_config["lr"], 120 | optimizer_config["weight_decay"] 121 | ) 122 | 123 | # Load the trainer 124 | trainer = CustomTrainer( 125 | model = model, 126 | optimizer = optimizer, 127 | train_gen = train_gen, 128 | valid_gen = valid_gen, 129 | dataloader = dataloader, 130 | valid_dataloader = valid_dataloader, 131 | device = self.device, 132 | **conf["trainer"] 133 | ) 134 | 135 | # Initialize LR annealing scheduler 136 | if "ReduceLROnPlateau" in conf["callbacks"]: 137 | schedule_config = conf["callbacks"]["ReduceLROnPlateau"] 138 | scheduler = ReduceLROnPlateau(trainer.optimizer, **schedule_config) 139 | logging.info( 140 | f"Loaded ReduceLROnPlateau learning rate annealer with patience {schedule_config['patience']}" 141 | ) 142 | elif "ExponentialLR" in conf["callbacks"]: 143 | schedule_config = conf["callbacks"]["ExponentialLR"] 144 | scheduler = ExponentialLR(trainer.optimizer, **schedule_config) 145 | logging.info( 146 | f"Loaded ExponentialLR learning rate annealer with reduce factor {schedule_config['gamma']}" 147 | ) 148 | 149 | # Initialize early stopping 150 | checkpoint_config = conf["callbacks"]["EarlyStopping"] 151 | early_stopping = EarlyStopping(**checkpoint_config) 152 | 153 | # Train the model 154 | val_loss, val_mse, val_bce, val_acc = trainer.train( 155 | trial, scheduler, early_stopping, self.metric 156 | ) 157 | 158 | results = { 159 | "val_loss": val_loss, 160 | "val_mse": val_mse, 161 | "val_bce": val_bce, 162 | "val_acc": val_acc 163 | } 164 | 165 | return self.save(trial, results) 166 | 167 | 168 | class CustomTrainer(BaseEncoderTrainer): 169 | 170 | def train(self, 171 | trial, 172 | scheduler, 173 | early_stopping, 174 | metric = "val_loss"): 175 | 176 | flag = isinstance( 177 | scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) 178 | 179 | for epoch in range(self.start_epoch, self.epochs): 180 | 181 | try: 182 | train_loss, train_mse, train_bce, train_accuracy = self.train_one_epoch(epoch) 183 | test_loss, test_mse, test_bce, test_accuracy = self.test(epoch) 184 | 185 | if "val_loss" in metric: 186 | metric_val = test_loss 187 | elif "val_mse_loss" in metric: 188 | metric_val = test_mse 189 | elif "val_bce_loss" in metric: 190 | metric_val = test_bce 191 | elif "val_acc" in metric: 192 | metric_val = -test_accuracy 193 | else: 194 | supported = "val_loss, val_mse_loss, val_bce_loss, val_acc" 195 | raise ValueError(f"The metric {metric} is not supported. 
Choose from {supported}") 196 | 197 | trial.report(-metric_val, step=epoch+1) 198 | scheduler.step(metric_val if flag else epoch) 199 | early_stopping(epoch, metric_val, self.model, self.optimizer) 200 | 201 | except Exception as E: # CUDA memory overflow 202 | print(traceback.print_exc()) 203 | raise optuna.TrialPruned() 204 | 205 | if trial.should_prune(): 206 | raise optuna.TrialPruned() 207 | 208 | if early_stopping.early_stop: 209 | break 210 | 211 | return test_loss, test_mse, test_bce, test_accuracy -------------------------------------------------------------------------------- /aimlutils/echo/examples/keras/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | import math 4 | import time 5 | import random 6 | import traceback 7 | import xarray as xr 8 | import numpy as np 9 | import pandas as pd 10 | from datetime import datetime 11 | 12 | import matplotlib.pyplot as plt 13 | import scipy.sparse 14 | from scipy.ndimage import gaussian_filter 15 | 16 | from tqdm.auto import tqdm 17 | 18 | import numpy.fft as FFT 19 | from typing import List, Dict 20 | 21 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 22 | from tensorflow.keras.layers import (Input, Conv2D, Dense, Flatten, 23 | MaxPool2D, RepeatVector, Lambda, 24 | LeakyReLU, Dropout) 25 | from tensorflow.keras.models import Model, save_model 26 | from tensorflow.keras.optimizers import Adam, SGD 27 | import tensorflow.keras.backend as K 28 | 29 | from keras_radam import RAdam 30 | from keras_radam.training import RAdamOptimizer 31 | 32 | 33 | 34 | class Conv2DNeuralNetwork(object): 35 | """ 36 | A Conv2D Neural Network Model that can support an arbitrary numbers of 37 | layers. 38 | 39 | Attributes: 40 | filters: List of number of filters in each Conv2D layer 41 | kernel_sizes: List of kernel sizes in each Conv2D layer 42 | conv2d_activation: Type of activation function for conv2d layers 43 | pool_sizes: List of Max Pool sizes 44 | dense_sizes: Sizes of dense layers 45 | dense_activation: Type of activation function for dense layers 46 | output_activation: Type of activation function for output layer 47 | lr: Optimizer learning rate 48 | optimizer: Name of optimizer or optimizer object. 
49 | adam_beta_1: Exponential decay rate for the first moment estimates 50 | adam_beta_2: Exponential decay rate for the first moment estimates 51 | sgd_momentum: Stochastic Gradient Descent momentum 52 | decay: Optimizer decay 53 | loss: Name of loss function or loss object 54 | batch_size: Number of examples per batch 55 | epochs: Number of epochs to train 56 | verbose: Level of detail to provide during training 57 | model: Keras Model object 58 | """ 59 | def __init__( 60 | self, 61 | filters=(8,), 62 | kernel_sizes=(5,), 63 | conv2d_activation="relu", 64 | pool_sizes=(4,), 65 | pool_dropout=0.0, 66 | dense_sizes=(64,), 67 | dense_activation="relu", 68 | dense_dropout = 0.0, 69 | output_activation="linear", 70 | lr=0.001, 71 | optimizer="adam", 72 | adam_beta_1=0.9, 73 | adam_beta_2=0.999, 74 | sgd_momentum=0.9, 75 | decay=0, 76 | loss="mse", 77 | metrics = [], 78 | batch_size=32, 79 | epochs=2, 80 | verbose=0 81 | ): 82 | 83 | self.filters = filters 84 | self.kernel_sizes = [tuple((v,v)) for v in kernel_sizes] 85 | self.conv2d_activation = conv2d_activation 86 | self.pool_sizes = [tuple((v,v)) for v in pool_sizes] 87 | self.pool_dropout = pool_dropout 88 | self.dense_sizes = dense_sizes 89 | self.dense_activation = dense_activation 90 | self.dense_dropout = dense_dropout 91 | self.output_activation = output_activation 92 | self.lr = lr 93 | self.optimizer = optimizer 94 | self.optimizer_obj = None 95 | self.adam_beta_1 = adam_beta_1 96 | self.adam_beta_2 = adam_beta_2 97 | self.sgd_momentum = sgd_momentum 98 | self.decay = decay 99 | self.loss = loss 100 | self.metrics = metrics 101 | self.batch_size = batch_size 102 | self.epochs = epochs 103 | self.verbose = verbose 104 | self.model = None 105 | 106 | if self.conv2d_activation == "leakyrelu": 107 | self.conv2d_activation = LeakyReLU(alpha=0.1) 108 | if self.dense_activation == "leakyrelu": 109 | self.dense_activation = LeakyReLU(alpha=0.1) 110 | if self.output_activation == "leakyrelu": 111 | self.output_activation = LeakyReLU(alpha=0.1) 112 | 113 | def build_neural_network(self, input_shape, n_particles, output_shape): 114 | """Create Keras neural network model and compile it.""" 115 | 116 | # Input 117 | conv_input = Input(shape=(input_shape), name="input") 118 | 119 | # ConvNet encoder 120 | nn_model = conv_input 121 | for h in range(len(self.filters)): 122 | nn_model = Conv2D(self.filters[h], 123 | self.kernel_sizes[h], 124 | padding="same", 125 | activation=self.conv2d_activation, 126 | kernel_initializer='he_uniform', 127 | name=f"conv2D_{h:02d}")(nn_model) 128 | nn_model = MaxPool2D(self.pool_sizes[h], padding='same', 129 | name=f"maxpool2D_{h:02d}")(nn_model) 130 | if self.pool_dropout > 0.0: 131 | nn_model = Dropout(self.pool_dropout, 132 | name = f"maxpool2D_dr_{h:02d}")(nn_model) 133 | nn_model = Flatten()(nn_model) 134 | 135 | # Classifier 136 | for h in range(len(self.dense_sizes)): 137 | nn_model = Dense(self.dense_sizes[h], 138 | activation=self.dense_activation, 139 | kernel_initializer='he_uniform', 140 | name=f"dense_{h:02d}")(nn_model) 141 | if self.dense_dropout > 0.0: 142 | nn_model = Dropout(self.dense_dropout, 143 | name=f"dense_dr_{h:02d}")(nn_model) 144 | 145 | # Output 146 | nn_model = RepeatVector(n_particles, name = "repeat")(nn_model) 147 | nn_model = Dense(output_shape, 148 | activation=self.output_activation, 149 | name=f"dense_output")(nn_model) 150 | nn_model = Lambda( 151 | self.LastLayer, 152 | input_shape = (n_particles, output_shape) 153 | )(nn_model) 154 | 155 | self.model = Model(conv_input, 
nn_model) 156 | 157 | if self.optimizer == "adam": 158 | self.optimizer_obj = Adam(lr=self.lr, clipnorm = 1.0) 159 | elif self.optimizer == "sgd": 160 | self.optimizer_obj = SGD(lr=self.lr, momentum=self.sgd_momentum, 161 | decay=self.decay) 162 | 163 | self.model.compile( 164 | optimizer=self.optimizer_obj, 165 | loss=self.loss, 166 | metrics=self.metrics 167 | ) 168 | #self.model.summary() 169 | 170 | def fit(self, x, y, xv=None, yv=None, callbacks=None): 171 | 172 | if len(x.shape[1:])==2: 173 | x = np.expand_dims(x, axis=-1) 174 | if len(y.shape) == 1: 175 | output_shape = 1 176 | else: 177 | output_shape = y.shape[1] 178 | 179 | input_shape = x.shape[1:] 180 | self.build_neural_network(input_shape, output_shape) 181 | self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs, 182 | verbose=self.verbose, validation_data=(xv, yv), callbacks=callbacks) 183 | return self.model.history.history 184 | 185 | def LastLayer(self, x): 186 | return 1.75 * K.tanh(x / 100) 187 | 188 | def predict(self, x): 189 | y_out = self.model.predict(np.expand_dims(x, axis=-1), 190 | batch_size=self.batch_size) 191 | return y_out 192 | 193 | def predict_proba(self, x): 194 | y_prob = self.model.predict(x, batch_size=self.batch_size) 195 | return y_prob 196 | 197 | def load_weights(self, weights): 198 | try: 199 | self.model.load_weights(weights) 200 | self.model.compile( 201 | optimizer=self.optimizer, 202 | loss=self.loss, 203 | metrics=self.metrics 204 | ) 205 | except: 206 | print("You must first call build_neural_network before loading weights. Exiting.") 207 | sys.exit(1) 208 | -------------------------------------------------------------------------------- /blog/site/memory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (12/22/20) Memory profiling python scripts with *memory_profiler*" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The *memory_profiler* package provides line-by-line output of how much memory is allocated for a process, cell, script, or workflow.\n", 15 | "\n", 16 | "***\n", 17 | "Installing *memory_profiler* is very easy.\n", 18 | "\n", 19 | "## Pip\n", 20 | "\n", 21 | "`pip install -U memory_profiler`\n", 22 | "\n", 23 | "## Conda\n", 24 | "\n", 25 | "`conda config --add channels conda-forge`\n", 26 | "\n", 27 | "`conda install memory_profiler`\n", 28 | "\n", 29 | "***\n", 30 | "\n", 31 | "*Memory_profiler* isn't just easy to install, it's easy to implement into your scripts, jupyter notebooks, or entire workflows. Below are a few of the many ways *memory_profiler* can be implemented. See documentation [here](https://pypi.org/project/memory-profiler/) and [here](https://github.com/pythonprofilers/memory_profiler).\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## 1. Simple decorator above any script function\n", 39 | "\n", 40 | "`@profile` above any function like in the script (`example.py`) below. You can set the precision with which the memory usage is reported.\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "Run using the following command: `python -m memory_profiler example.py` and you will generate a file called `memory_profiler.log` containing the following output.\n", 45 | "\n", 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## 2. 
Decorators above sub-functions\n", 54 | "\n", 55 | "`@profile` above any sub-functions (`test.py`) called in your main script (`example.py`) below.\n", 56 | "\n", 57 | "\n", 58 | "\n", 59 | "\n", 60 | "`memory_profiler.log` will output the following:\n", 61 | "\n", 62 | "" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## 3. Eliminate -m memory_profile flag by importing module into the script\n", 70 | "\n", 71 | "\n", 72 | "" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 4. Track and plot memory as a function of time\n", 80 | "\n", 81 | "Import and decorate exactly as the above example. Then instead of python, run via `mprof run example.py`\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "This method of running will still lead to the familiar output we've seen in previous ways of using *memory_profiler*.\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "However, in addition to the line-by-line summary, the output of `mprof run` will be saved in a file that begins with `mprofile_` and ends in `.dat`. To create a plot out of this output, run `mprof plot --output=plot.png` and the following will be created:\n", 90 | "\n", 91 | "" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## 5. A more complicated script example output\n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | "" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## 6. Jupyter notebook importing module\n", 110 | "\n", 111 | "When comparing or testing various functions, *memory_profiler* can be used by importing the module's various methods." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 1, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "from memory_profiler import memory_usage" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 2, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "[45.046875, 45.046875, 45.046875, 45.046875, 45.046875]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "mem_usage = memory_usage(-1, interval=.2, timeout=1)\n", 138 | "print(mem_usage)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "def aaa():\n", 148 | " a = [1] * (10 ** 6)\n", 149 | " b = [2] * (10 ** 7)\n", 150 | " del b\n", 151 | " return a" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "[129.24609375, 129.2578125, 136.890625, 136.890625, 136.890625, 136.890625]\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "mem_usage = memory_usage(aaa, interval=.2, timeout=1)\n", 169 | "print(mem_usage)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## 7. Jupyter notebook magic function\n", 177 | "\n", 178 | "Using *memory_profiler* can be as easy as implementing notebok magic functions, as shown below." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 5, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%load_ext memory_profiler" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "peak memory: 136.90 MiB, increment: 0.00 MiB\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "%memit range(10000)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "peak memory: 136.91 MiB, increment: 0.01 MiB\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%memit range(10000000)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## 8. Other methods of running and customizing *memory_profiler*\n", 229 | "\n", 230 | "The *memory_profile* module can be run via multi-processing and will output the memory usage of child processes. *Memory_profiler* can also be used to debug via a memory threshold. There are various ways of reporting the results of the memory profiling. Finally, there are various ways to customize the output and plotting of *memory_profiler*. For more customization and further running options, please see documentation [here](https://pypi.org/project/memory-profiler/) and [here](https://github.com/pythonprofilers/memory_profiler)." 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.8.6" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 4 255 | } 256 | -------------------------------------------------------------------------------- /blog/site/optuna_mariadb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (2/26/21) A short primer on using Optuna and ECHO to interact with a sql database" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1. We have access to a MariaDB located on [thunder](https://www2.cisl.ucar.edu/resources/computational-systems/thunder-user-guide)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "MariaDB: MySQL relational database management system.\n", 22 | "\n", 23 | "The MariaDB server is accessible from an NCAR IP address, but you cannot login to MariaDB as root remotely. To interact with the database as root, you would need to ssh to thunder and from there you will be able to login to MariaDB as root to setup/manage the database. This will not affect the interaction between optuna and the database, but we will need root in order to manage the database (future).\n", 24 | "\n", 25 | "In this blog, we have a database named \"optuna\". For demonstrating purposes, we imagine that a user \"icarus\" exists. If you are at NCAR and are experimenting with mysql + optuna, you may email John Schreck about obtaining access. 
Ordinarily, to get onto thunder, you will use your NCAR password (same as for casper, cheyenne, etc)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### 2. Optuna does not have much to say with regards to its sql support. \n", 33 | "\n", 34 | "In general, this interaction is low-level, while your interaction with optuna is much higher. To that end, the simplest way to go about managing your studies is to use the create_study and delete_study methods. \n", 35 | "\n", 36 | "You may continue to use the sqlite \"storage\", but be warned that once 1000 trials are saved to the named study, the performance will degrade quickly. This is especially apparent when running the hyperparameter importance metrics, which query the database and train a tree model on the fly." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### 3. Example: Using create_study and delete_study" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "First, lets see what tables are in the \"optuna\" database on thunder (from terminal):" 51 | ] 52 | }, 53 | { 54 | "cell_type": "raw", 55 | "metadata": {}, 56 | "source": [ 57 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'show tables'" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "(I shared an ssh key, hence not having to use Duo. Details at the bottom of this tutorial)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Next, lets list the study names user \"schreck\" has saved into optuna: " 72 | ] 73 | }, 74 | { 75 | "cell_type": "raw", 76 | "metadata": {}, 77 | "source": [ 78 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Now we create a new study named \"example\":" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 1, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import optuna" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "study = optuna.create_study(\n", 104 | " study_name=\"example\", \n", 105 | " storage=\"mysql://icarus:password@thunder.ucar.edu/optuna\"\n", 106 | ")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Confirm that the study was actually created by repeating the command from earlier: " 114 | ] 115 | }, 116 | { 117 | "cell_type": "raw", 118 | "metadata": {}, 119 | "source": [ 120 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Next, in your hyperparameters.yml configuration file, we simply point to the database as follows under the optuna field:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "study_name: \"example\"\n", 137 | "storage: \"mysql://icarus:password@thunder.ucar.edu/optuna\"" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "You don't have to worry about entering your sql password, it is already contained in the storage link! 
Since we are on an NCAR server, we also do not need to use Duo, although this will be changing in the near future. The forth-coming additional security will likely become problematic, but we will deal with that later. " 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Note that you don't have to create a study beforehand if it does not exist, the optimize.py script that is used to launch a hyperparameter study, contained in the [ECHO](https://github.com/NCAR/aiml-utils/tree/master/aimlutils/echo) package, will call create_study for you:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "raw", 156 | "metadata": {}, 157 | "source": [ 158 | "python $echo/optimize.py hyperparameter.yml model.yml" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "For now, when its time to delete a study from our optuna database, simply call the optuna method delete_study:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "optuna.delete_study(\n", 175 | " study_name=\"example\", \n", 176 | " storage=\"mysql://icarus:password@thunder.ucar.edu/optuna\"\n", 177 | ")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let us double check that it was actaully removed:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "raw", 189 | "metadata": {}, 190 | "source": [ 191 | "mysql -u icarus -p -h thunder.ucar.edu -D optuna -e 'select * from studies'" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "Ordinarily, you set reload = 0 in your hyperparameters.yml file when starting a new study. If the study name already exists, optimize.py/run.py will fail with an error message (I will not delete or overwrite things automatically. That job is left up to you).\n", 199 | "\n", 200 | "When using the sqlite database solution, you simply delete that file. For sql support, the script will still complain at you, but a new parser option has been added that will facilitate the delete_study call:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "raw", 205 | "metadata": {}, 206 | "source": [ 207 | "The study {study_name} already exists in storage and reload was False.\n", 208 | "Delete it from {storage}, and try again or rerun this script\n", 209 | "with the flag: --override 1" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "E.g. you run:" 217 | ] 218 | }, 219 | { 220 | "cell_type": "raw", 221 | "metadata": {}, 222 | "source": [ 223 | "python $echo/optimize.py hyperparameter.yml model.yml --override 1" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "And the study_name will be deleted from the storage container. Note that its gone forever, so be extra careful that this is what you intended. " 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### 4. For more, checkout [this tutorial](https://www.guru99.com/data-warehousing-tutorial.html) on data warehousing." 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Feel free to email me (John Schreck, schreck@ucar.edu) with any questions / mistakes / whatever!" 
245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.8.6" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 4 269 | } 270 | -------------------------------------------------------------------------------- /aimlutils/echo/report.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import os 5 | import sys 6 | import yaml 7 | import optuna 8 | import logging 9 | import pandas as pd 10 | import matplotlib as mpl 11 | import matplotlib.pyplot as plt 12 | from argparse import ArgumentParser 13 | from typing import Dict 14 | 15 | 16 | def args(): 17 | parser = ArgumentParser(description= 18 | "report.py: Get the status/progress of a hyperparameter study" 19 | ) 20 | 21 | parser.add_argument("hyperparameter", type=str, help= 22 | "Path to the hyperparameter configuration containing your inputs." 23 | ) 24 | 25 | parser.add_argument( 26 | "-p", 27 | "--plot", 28 | dest="plot", 29 | type=str, 30 | default=False, 31 | help="A yaml structured file containining settings for matplotlib/pylab objects" 32 | ) 33 | 34 | parser.add_argument( 35 | "-t", 36 | "--n_trees", 37 | dest="n_trees", 38 | type=int, 39 | default=64, 40 | help="The number of trees to use in parameter importance models. Default is 64." 41 | ) 42 | 43 | parser.add_argument( 44 | "-d", 45 | "--max_depth", 46 | dest="max_depth", 47 | type=int, 48 | default=64, 49 | help="The maximum depth to use in parameter importance models. Default is 64." 50 | ) 51 | 52 | return vars(parser.parse_args()) 53 | 54 | 55 | def update_figure(fig: mpl.figure.Figure, 56 | params: Dict[str, str] = False) -> mpl.figure.Figure: 57 | """ 58 | Updates some mpl Figure parameters. Only limited support for now. 59 | In a future version the optuna plots will be moved here 60 | and expanded customization will be enabled. 61 | 62 | Returns a matplotlib Figure 63 | 64 | Inputs: 65 | fig: a matplotlib Figure 66 | params: a dictionary containing mpl fields 67 | """ 68 | 69 | if params is False: 70 | fig.set_yscale("log") 71 | mpl.rcParams.update({"figure.dpi": 300}) 72 | else: 73 | if "rcparams" in params: 74 | mpl.rcParams.update(**params["rcparams"]) 75 | if "set_xlim" in params: 76 | fig.set_xlim(params["set_xlim"]) 77 | if "set_ylim" in params: 78 | fig.set_ylim(params["set_ylim"]) 79 | if "set_xscale" in params: 80 | fig.set_xscale(params["set_xscale"]) 81 | if "set_yscale" in params: 82 | fig.set_yscale(params["set_yscale"]) 83 | 84 | plt.tight_layout() 85 | return fig 86 | 87 | 88 | def plot_wrapper(study: optuna.study.Study, 89 | identifier: str, 90 | save_path: str, 91 | params: Dict[str, str] = False): 92 | 93 | 94 | """ 95 | Creates and saves an intermediate values plot. 96 | 97 | Does not return. 98 | 99 | Inputs: 100 | study: an Optuna study object 101 | identifier: a string identifier for selecting the optuna plot method 102 | save_path: a path where the plot should be saved 103 | params: a dictionary containing mpl fields. 
Default = False 104 | """ 105 | 106 | flag = isinstance(params, dict) 107 | if flag and identifier in params: 108 | params = params[identifier] 109 | else: 110 | flag = False 111 | 112 | # Use optunas mpl object for now 113 | if identifier == "intermediate_values": 114 | fig = optuna.visualization.matplotlib.plot_intermediate_values(study) 115 | elif identifier == "optimization_history": 116 | fig = optuna.visualization.matplotlib.plot_optimization_history(study) 117 | elif identifier == "pareto_front": 118 | fig = optuna.multi_objective.visualization.plot_pareto_front(study) 119 | else: 120 | raise OSError(f"An incorrect optuna plot identifier {identifier} was used") 121 | 122 | fig = update_figure(fig, params) 123 | 124 | if flag and "save_path" in params: 125 | save_path = params["save_path"] 126 | 127 | figure_save_path = os.path.join(save_path, f"{identifier}.pdf") 128 | plt.savefig(figure_save_path) 129 | 130 | logging.info( 131 | f"Saving the {identifier} plot to file at {figure_save_path}" 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | if len(sys.argv) < 2: 138 | raise OSError( 139 | "Usage: python report.py hyperparameter.yml [optional arguments]" 140 | "To see the available parser options: python report.py --help" 141 | ) 142 | 143 | args_dict = args() 144 | 145 | hyper_config = args_dict.pop("hyperparameter") 146 | plot_config = args_dict.pop("plot") if "plot" in args_dict else False 147 | 148 | # Options for the parameter importance tree models 149 | n_trees = args_dict.pop("n_trees") 150 | max_depth = args_dict.pop("max_depth") 151 | 152 | # Check if hyperparameter config file exists 153 | if os.path.isfile(hyper_config): 154 | with open(hyper_config) as f: 155 | hyper_config = yaml.load(f, Loader=yaml.FullLoader) 156 | else: 157 | raise OSError( 158 | f"Hyperparameter optimization config file {hyper_config} does not exist" 159 | ) 160 | 161 | if plot_config is not False: 162 | if os.path.isfile(plot_config): 163 | with open(plot_config) as p: 164 | plot_config = yaml.load(p, Loader=yaml.FullLoader) 165 | else: 166 | raise OSError( 167 | f"Hyperparameter optimization plot file {plot_config} does not exist" 168 | ) 169 | 170 | 171 | # Set up a logger 172 | root = logging.getLogger() 173 | root.setLevel(logging.INFO) 174 | formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s') 175 | 176 | # Stream output to stdout 177 | ch = logging.StreamHandler() 178 | ch.setLevel(logging.INFO) 179 | ch.setFormatter(formatter) 180 | root.addHandler(ch) 181 | 182 | save_path = hyper_config["optuna"]["save_path"] 183 | study_name = hyper_config["optuna"]["study_name"] 184 | storage = hyper_config["optuna"]["storage"] 185 | reload_study = bool(hyper_config["optuna"]["reload"]) 186 | cached_study = f"{save_path}/{study_name}" 187 | 188 | direction = hyper_config["optuna"]["direction"] 189 | single_objective = isinstance(direction, str) 190 | 191 | # Load from database 192 | #storage = f'postgresql+psycopg2://john:schreck@localhost/{cached_study}' 193 | #storage = f"sqlite:///{cached_study}" 194 | 195 | if single_objective: 196 | study = optuna.load_study(study_name=study_name, storage=storage) 197 | else: 198 | study = optuna.multi_objective.study.load_study( 199 | study_name=study_name, 200 | storage=storage 201 | ) 202 | 203 | # Check a few other stats 204 | pruned_trials = [ 205 | t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED 206 | ] 207 | complete_trials = [ 208 | t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE 209 | 
] 210 | 211 | logging.info(f'Number of requested trials per worker: {hyper_config["optuna"]["n_trials"]}') 212 | logging.info(f"Number of trials in the database: {len(study.trials)}") 213 | logging.info(f"Number of pruned trials: {len(pruned_trials)}") 214 | logging.info(f"Number of completed trials: {len(complete_trials)}") 215 | 216 | if len(complete_trials) == 0: 217 | logging.info("There are no complete trials in this study.") 218 | logging.info("Wait until the workers finish a few trials and try again.") 219 | sys.exit() 220 | 221 | logging.info(f"Best trial: {study.best_trial.value}") 222 | 223 | if len(complete_trials) > 1: 224 | f_importance = optuna.importance.FanovaImportanceEvaluator( 225 | n_trees = n_trees, max_depth = max_depth).evaluate(study=study) 226 | logging.info(f"fANOVA parameter importance {dict(f_importance)}") 227 | mdi_importance = optuna.importance.MeanDecreaseImpurityImportanceEvaluator( 228 | n_trees = n_trees, max_depth = max_depth).evaluate(study=study) 229 | logging.info(f"Mean decrease impurity (MDI) parameter importance {dict(mdi_importance)}") 230 | 231 | logging.info("Best parameters in the study:") 232 | for param, val in study.best_params.items(): 233 | logging.info(f"{param}: {val}") 234 | 235 | if len(study.trials) < hyper_config["optuna"]["n_trials"]: 236 | logging.warning( 237 | "Not all of the trials completed due to the wall-time." 238 | ) 239 | logging.warning( 240 | "Set reload = 1 in the hyperparameter config and resubmit some more workers to finish!" 241 | ) 242 | 243 | save_fn = os.path.join(save_path, f"{study_name}.csv") 244 | logging.info(f"Saving the results of the study to file at {save_fn}") 245 | study.trials_dataframe().to_csv(save_fn, index = None) 246 | 247 | if single_objective: 248 | 249 | # Plot the optimization_history 250 | plot_wrapper(study, "optimization_history", save_path, plot_config) 251 | 252 | # Plot the intermediate_values 253 | plot_wrapper(study, "intermediate_values", save_path, plot_config) 254 | 255 | else: 256 | # Plot the pareto front 257 | plot_wrapper(study, "pareto_front", save_path, plot_config) 258 | -------------------------------------------------------------------------------- /aimlutils/echo/run.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from aimlutils.echo.src.samplers import samplers 5 | from aimlutils.utils.gpu import gpu_report 6 | import importlib.machinery 7 | import pandas as pd 8 | import numpy as np 9 | import logging 10 | import optuna 11 | import time 12 | import glob 13 | import yaml 14 | import sys 15 | import os 16 | 17 | start_the_clock = time.time() 18 | 19 | 20 | def get_sec(time_str): 21 | """Get Seconds from time.""" 22 | h, m, s = time_str.split(':') 23 | return int(h) * 3600 + int(m) * 60 + int(s) 24 | 25 | # References 26 | # https://github.com/optuna/optuna/issues/1365 27 | # https://docs.dask.org/en/latest/setup/hpc.html 28 | # https://dask-cuda.readthedocs.io/en/latest/worker.html 29 | # https://optuna.readthedocs.io/en/stable/tutorial/004_distributed.html#distributed 30 | 31 | if len(sys.argv) != 3: 32 | print( 33 | "Usage: python run.py hyperparameter.yml model.yml" 34 | ) 35 | sys.exit() 36 | 37 | # Set up a logger 38 | root = logging.getLogger() 39 | root.setLevel(logging.DEBUG) 40 | formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s') 41 | 42 | # Stream output to stdout 43 | ch = logging.StreamHandler() 44 | ch.setLevel(logging.INFO) 45 | 
ch.setFormatter(formatter) 46 | root.addHandler(ch) 47 |  48 | ################################################################ 49 |  50 | # Check if hyperparameter config file exists 51 | if os.path.isfile(sys.argv[1]): 52 |     with open(sys.argv[1]) as f: 53 |         hyper_config = yaml.load(f, Loader=yaml.FullLoader) 54 | else: 55 |     raise OSError( 56 |         f"Hyperparameter optimization config file {sys.argv[1]} does not exist" 57 |     ) 58 |  59 | # Check if the wall-time exists 60 | if "slurm" in hyper_config: 61 |     if "t" not in hyper_config["slurm"]["batch"]: 62 |         raise OSError( 63 |             "You must supply a wall time in the hyperparameter config at slurm:batch:t" 64 |         ) 65 | if "pbs" in hyper_config: 66 |     if not any([("walltime" in x) for x in hyper_config["pbs"]["batch"]["l"]]): 67 |         raise OSError( 68 |             "You must supply a wall time in the hyperparameter config at pbs:batch:l" 69 |         ) 70 |  71 | # Check if model config file exists 72 | if os.path.isfile(sys.argv[2]): 73 |     with open(sys.argv[2]) as f: 74 |         model_config = yaml.load(f, Loader=yaml.FullLoader) 75 | else: 76 |     raise OSError( 77 |         f"Model config file {sys.argv[2]} does not exist" 78 |     ) 79 |  80 | # Copy the optuna details to the model config 81 | model_config["optuna"] = hyper_config["optuna"] 82 |  83 | # Check if path to objective method exists 84 | if os.path.isfile(model_config["optuna"]["objective"]): 85 |     loader = importlib.machinery.SourceFileLoader( 86 |         "custom_objective", 87 |         model_config["optuna"]["objective"] 88 |     ) 89 |     mod = loader.load_module() 90 |     from custom_objective import Objective 91 | else: 92 |     raise OSError( 93 |         f'The objective file {model_config["optuna"]["objective"]}\ 94 |         does not exist' 95 |     ) 96 |  97 | # Check if the optimization metric direction is supported 98 | direction = model_config["optuna"]["direction"] 99 | single_objective = isinstance(direction, str) 100 |  101 | if single_objective: 102 |     if direction not in ["maximize", "minimize"]: 103 |         raise OSError( 104 |             f"Optimizer direction {direction} not recognized. \ 105 |             Choose from maximize or minimize" 106 |         ) 107 | else: 108 |     for direc in direction: 109 |         if direc not in ["maximize", "minimize"]: 110 |             raise OSError( 111 |                 f"Optimizer direction {direc} not recognized. 
\ 112 | Choose from maximize or minimize" 113 | ) 114 | 115 | logging.info(f"Direction of optimization {direction}") 116 | 117 | ### Add other config checks 118 | 119 | ################################################################ 120 | 121 | # Stream output to log file 122 | if "log" in hyper_config: 123 | savepath = hyper_config["log"]["save_path"] if "save_path" in hyper_config["log"] else "log.txt" 124 | mode = "a+" if bool(hyper_config["optuna"]["reload"]) else "w" 125 | fh = logging.FileHandler(savepath, 126 | mode=mode, 127 | encoding='utf-8') 128 | fh.setLevel(logging.DEBUG) 129 | fh.setFormatter(formatter) 130 | root.addHandler(fh) 131 | 132 | # Get the path to save all the data 133 | save_path = model_config["optuna"]["save_path"] 134 | logging.info(f"Saving optimization details to {save_path}") 135 | 136 | # Grab the metric 137 | if isinstance(model_config["optuna"]["metric"], list): 138 | metric = [str(m) for m in model_config["optuna"]["metric"]] 139 | else: 140 | metric = str(model_config["optuna"]["metric"]) 141 | logging.info(f"Using metric {metric}") 142 | 143 | # Get list of devices and initialize the Objective class 144 | if bool(model_config["optuna"]["gpu"]): 145 | try: 146 | gpu_report = sorted( 147 | gpu_report().items(), 148 | key = lambda x: x[1], 149 | reverse = True 150 | ) 151 | device = gpu_report[0][0] 152 | except: 153 | logging.warning( 154 | "The gpu is not responding to a call from nvidia-smi.\ 155 | Setting gpu device = 0, but this may fail." 156 | ) 157 | device = 0 158 | else: 159 | device = 'cpu' 160 | logging.info(f"Using device {device}") 161 | 162 | ################################################################ 163 | 164 | # Initialize the study object 165 | study_name = model_config["optuna"]["study_name"] 166 | reload_study = bool(model_config["optuna"]["reload"]) 167 | 168 | # cached_study = f"{save_path}/{study_name}" 169 | 170 | # if not os.path.isfile(cached_study) or not reload_study: 171 | # load_if_exists = False 172 | # elif not reload_study: 173 | # os.remove(cached_study) 174 | # load_if_exists = reload_study 175 | # else: 176 | # load_if_exists = True 177 | 178 | # Identify the storage location 179 | storage = model_config["optuna"]["storage"] #f"sqlite:///{cached_study}" 180 | 181 | # Initialize the sampler 182 | if "sampler" not in hyper_config["optuna"]: 183 | if single_objective: # single-objective 184 | sampler = optuna.samplers.TPESampler() 185 | else: # multi-objective equivalent of TPESampler 186 | sampler = optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler() 187 | else: 188 | sampler = samplers(hyper_config["optuna"]["sampler"]) 189 | 190 | # Load or initiate study 191 | if single_objective: 192 | study = optuna.create_study(study_name=study_name, 193 | storage=storage, 194 | sampler=sampler, 195 | direction=direction, 196 | load_if_exists=True) 197 | else: 198 | study = optuna.multi_objective.study.create_study( 199 | study_name=study_name, 200 | storage=storage, 201 | sampler=sampler, 202 | directions=direction, 203 | load_if_exists=True 204 | ) 205 | logging.info(f"Loaded study {study_name} located at {storage}") 206 | 207 | # Initialize objective function 208 | objective = Objective(model_config, metric, device) 209 | 210 | # Optimize it 211 | logging.info( 212 | f'Running optimization for {model_config["optuna"]["n_trials"]} trials' 213 | ) 214 | 215 | # Get the cluster job wall-time 216 | if "slurm" in hyper_config: 217 | wall_time = hyper_config["slurm"]["batch"]["t"] 218 | elif "pbs" in hyper_config: 219 | 
wall_time = False 220 |     for option in hyper_config["pbs"]["batch"]["l"]: 221 |         if "walltime" in option: 222 |             wall_time = option.split("walltime=")[-1] 223 |             break 224 |     if wall_time is False: 225 |         logging.warning("Could not process the walltime for run.py. Assuming 12 hours.") 226 |         wall_time = "12:00:00" 227 | wall_time_secs = get_sec(wall_time) 228 |  229 | logging.info( 230 |     f"This script will run for a fraction of the wall-time of {wall_time} and try to die without error" 231 | ) 232 |  233 | run_times = [] 234 | estimated_run_time = wall_time_secs 235 |  236 | # study.optimize( 237 | #     objective, 238 | #     n_trials = int(model_config["optuna"]["n_trials"]), 239 | #     timeout = estimated_run_time, 240 | #     catch = (ValueError,) 241 | # ) 242 |  243 | # Testing out a way to stop running trials if too close to wall-time. 244 | # Update to computing the mean of the run times of all completed trials in the database. 245 |  246 | for iteration in range(int(model_config["optuna"]["n_trials"])): 247 |  248 |     try: 249 |         start_time = time.time() 250 |         study.optimize( 251 |             objective, 252 |             n_trials = 1, 253 |             timeout = estimated_run_time, 254 |             #catch = (ValueError,) 255 |         ) 256 |         end_time = time.time() 257 |         run_times.append(end_time - start_time) 258 |  259 |     except KeyboardInterrupt: 260 |         logging.warning( 261 |             "Received signal to die from keyboard. Exiting." 262 |         ) 263 |         break 264 |  265 |     except Exception as E: 266 |         logging.warning( 267 |             f"Dying early due to error {E}" 268 |         ) 269 |         break 270 |  271 |     if len(run_times) > 1: 272 |         average_run_time = np.mean(run_times) 273 |         sigma_run_time = np.std(run_times) if len(run_times) > 2 else 0.0 274 |         estimated_run_time = average_run_time + 2 * sigma_run_time 275 |         time_left = wall_time_secs - (time.time() - start_the_clock) 276 |         if time_left < estimated_run_time: 277 |             logging.warning( 278 |                 "Dying early as estimated run-time exceeds the time remaining on this node." 279 |             ) 280 |             break -------------------------------------------------------------------------------- /aimlutils/echo/README.ipynb: -------------------------------------------------------------------------------- 1 | # hyper_opt: A distributed multi-gpu hyperparameter optimization package built with optuna 2 |  3 | ### Usage 4 |  5 |     python optimize.py hyperparameters.yml model.yml 6 |  7 | ### Dependencies 8 |  9 | There are three files that must be supplied to use the optimize script: 10 |  11 | * A custom objective function that performs the model training and returns the metric value to be optimized. 12 |  13 | * A configuration file specifying the hyperparameter optimization settings. 14 |  15 | * A model configuration file that contains the available hyperparameters that will get optimized. 16 |  17 | ### Custom objective class 18 | The user must supply a custom **Objective** class (objective.py) that is composed with an internal **BaseObjective** class (base_objective.py), and contains a method named **train** that returns the value of the optimization metric in a dictionary. See the examples directory for both torch and Keras examples. Note that the objective class only needs to return the metric value (in dictionary form) and does not depend on the machine learning library used. 
For example, a simple Objective class template will have the following structure: 19 |  20 |     from aimlutils.hyper_opt.base_objective import * 21 |  22 |     class Objective(BaseObjective): 23 |  24 |         def __init__(self, study, config, metric = "val_loss", device = "cpu"): 25 |  26 |             # Initialize the base class 27 |             BaseObjective.__init__(self, study, config, metric, device) 28 |  29 |         def train(self, trial, conf): 30 |  31 |             # Make any custom edits to the model conf before using it to train a model. 32 |             conf = custom_updates(trial, conf) 33 |  34 |             ... 35 |  36 |             result = Model.fit(...) 37 |  38 |             results_dictionary = { 39 |                 "val_loss": result["val_loss"], 40 |                 "loss": result["loss"], 41 |                 ... 42 |                 "val_accuracy": result["val_accuracy"] 43 |             } 44 |             return results_dictionary 45 |  46 | The BaseObjective must be initialized using the input parameters to the Objective (they must match!). The metric used to evaluate model performance must always be in the results dictionary, while other metrics that the user may want to track will also be stored and saved so long as they are included in the results dictionary. The base class will call the train method from its **__call__** method, and finishes up by calling a save method that takes care of writing the metric(s) details to file. Check out the script run.py to see how things are called. 47 |  48 | Note that the first line in the train method states that any custom changes to the model configuration (conf) must be done here. If custom changes are required, the user must supply a method named **custom_updates** in addition to the Objective class (save both in the same script). See also the section **Custom configuration edits** below for more details. 49 |  50 | ### Hyperparameter optimizer configuration 51 | There are three main fields, log, slurm, and optuna, with variable subfields within each field. The log field allows us to save a file for printing messages and warnings that are placed in areas throughout the package. The slurm field allows the user to specify how many GPU nodes should be used, and supports any slurm setting. The optuna field allows the user to configure the optimization procedure, including specifying which parameters will be used, as well as the performance metric. For example, consider the configuration settings: 52 |  53 | * log 54 |   + save_path: "path/to/data/log.txt" 55 | * slurm 56 |   + jobs: 20 57 |   + batch: 58 |     + account: "NAML0001" 59 |     + gres: "gpu:v100:1" 60 |     + mem: "128G" 61 |     + n: 8 62 |     + t: "12:00:00" 63 |     + J: "hyper_opt" 64 |     + o: "hyper_opt.out" 65 |     + e: "hyper_opt.err" 66 | * optuna 67 |   + name: "holodec_optimization.db" 68 |   + reload: 0 69 |   + objective: "examples/torch_objective.py" 70 |   + metric: "val_loss" 71 |   + direction: "minimize" 72 |   + n_trials: 500 73 |   + gpu: True 74 |   + save_path: 'test' 75 |   + sampler: 76 |     + type: "TPESampler" 77 |   + parameters: 78 |     + num_dense: 79 |       + type: "int" 80 |       + settings: 81 |         + name: "num_dense" 82 |         + low: 0 83 |         + high: 10 84 |     + dropout: 85 |       + type: "float" 86 |       + settings: 87 |         + name: "dr" 88 |         + low: 0.0 89 |         + high: 0.5 90 |     + **optimizer:learning_rate**: 91 |       + type: "loguniform" 92 |       + settings: 93 |         + name: "lr" 94 |         + low: 0.0000001 95 |         + high: 0.01 96 |  97 | The subfields within the optuna field have the following functionality: 98 |  99 | * name: ($\color{red}{string}$) The name of the study. 100 | * reload: ($\color{red}{bool}$) Whether to continue using a previous study (True) or to initialize a new study (False). If your initial number of workers does not reach the number of trials and you wish to resubmit, set to True. 101 | * objective: ($\color{red}{string}$) The path to the user-supplied objective class (it must be named objective.py) 102 | * metric: ($\color{red}{string}$) The metric to be used to determine the model performance. 103 | * direction: ($\color{red}{string}$) Indicates which direction the metric must go to represent improvement (pick from maximize or minimize) 104 | * n_trials: ($\color{red}{int}$) The number of trials in the study. 105 | * gpu: ($\color{red}{bool}$) Use the gpu or cpu. 106 | * save_path: ($\color{red}{string}$) Directory path where data will be saved. 107 | * sampler 108 |   + type: ($\color{red}{string}$) Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space). 109 | * parameters 110 |   + type: ($\color{red}{string}$) Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform". 111 |   + settings: This field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) 112 | 
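Collected into plain YAML form, the settings listed above correspond roughly to the sketch below. It is abbreviated and illustrative only: the values are the placeholder examples from the listing, not tested defaults, and some slurm batch keys are omitted for brevity.

    log:
      save_path: "path/to/data/log.txt"
    slurm:
      jobs: 20
      batch:
        account: "NAML0001"
        gres: "gpu:v100:1"
        mem: "128G"
        n: 8
        t: "12:00:00"
    optuna:
      name: "holodec_optimization.db"
      reload: 0
      objective: "examples/torch_objective.py"
      metric: "val_loss"
      direction: "minimize"
      n_trials: 500
      gpu: True
      save_path: "test"
      sampler:
        type: "TPESampler"
      parameters:
        num_dense:
          type: "int"
          settings:
            name: "num_dense"
            low: 0
            high: 10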
If the initial number of workers does not reach the number of trials and you wish to resubmit, set this to True.
101 | * objective: ($\color{red}{string}$) The path to the user-supplied objective class (it must be named objective.py).
102 | * metric: ($\color{red}{string}$) The metric to be used to determine the model performance.
103 | * direction: ($\color{red}{string}$) Indicates which direction the metric must go to represent improvement (pick from maximize or minimize).
104 | * n_trials: ($\color{red}{int}$) The number of trials in the study.
105 | * gpu: ($\color{red}{bool}$) Use the gpu or cpu.
106 | * save_path: ($\color{red}{string}$) Directory path where data will be saved.
107 | * sampler
108 |   + type: ($\color{red}{string}$) Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space).
109 | * parameters
110 |   + type: ($\color{red}{string}$) Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform".
111 |   + settings: This field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform).
112 | 
113 | ### Model configuration
114 | The model configuration file should be the one you have been using up to this point to train models. This package will take the suggested hyperparameters from an optuna trial and make changes to the model configuration. This can either be done automatically by this package, or the user may supply an additional method for making custom changes. For example, consider the (truncated) configuration for training a model to predict hologram properties with the holodec data:
115 | 
116 | * model:
117 |   + image_channels: 1
118 |   + hidden_dims: [3, 94, 141, 471, 425, 1122]
119 |   + z_dim: 1277
120 |   + dense_hidden_dims: [1000]
121 |   + dense_dropouts: [0.0]
122 |   + tasks: ["x", "y", "z", "d", "binary"]
123 |   + **optimizer**:
124 |     * type: "lookahead-diffgrad"
125 |     * **learning_rate**: 0.000631
126 |     * weight_decay: 0.0
127 |   + trainer:
128 |     * start_epoch: 0
129 |     * epochs: 1
130 |     * clip: 1.0
131 |     * alpha: 1.0
132 |     * beta: 0.1
133 |     * path_save: "test"
134 | 
135 | The model configuration can be automatically updated using this package if the name of the parameter specified in the hyperparameter configuration, optuna.parameters, can be used as a nested lookup key in the model configuration's nested dictionary.
For example, observe in the hyperparameter configuration file that the named parameter **optimizer:learning_rate** contains a colon, which is used downstream to split the name into multiple keys that allow us, starting from the top of the nested tree in the model configuration, to work our way down until the field is located and the trial-suggested value is substituted in. In this example, the split keys are ["optimizer", "learning_rate"].
136 | 
137 | This scheme will work in general as long as the named parameter in optuna.parameters uses : as the separator, and, once split, the resulting list can be used to locate the relevant field in the model configuration.
138 | 
139 | 
140 | ### Custom configuration edits
141 | 
142 | The user can also supply rules for updating the model configuration file by including a method named **custom_updates**, which will make the desired changes to the configuration file using optuna trial parameter guesses.
143 | 
144 | In the example configurations described above, the hyperparameter configuration contained an optuna.parameters field "num_dense," but this field is not present in the model configuration. There is however a dense_hidden_dims field in the model configuration that contains a list of the layer sizes in the model (where the number of layers is the length of the list). In the example, just one layer is specified, but we want to vary that number. To use the num_dense hyperparameter from the hyperparameter configuration file, we need to create the following custom method:
145 | 
146 |     def custom_updates(trial, conf):
147 | 
148 |         # Get list of hyperparameters from the config
149 |         hyperparameters = conf["optuna"]["parameters"]
150 | 
151 |         # Now update some via custom rules (unpack the "settings" subfield, matching the "int" type above)
152 |         num_dense = trial.suggest_int(**hyperparameters["num_dense"]["settings"])
153 | 
154 |         # Update the config based on optuna's suggestion
155 |         conf["model"]["dense_hidden_dims"] = [1000 for k in range(num_dense)]
156 | 
157 |         return conf
158 | 
159 | This custom method should be called first thing in the custom Objective.train method. You may have noticed that the configuration (named conf) contains both hyperparameter and model fields. This package will copy the hyperparameter fields to the model configuration for convenience, so that we can reduce the total number of class and method dependencies (which helps me keep the code generalized). This occurs in the run.py script.
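160 | 
161 | For orientation, the call flow looks roughly like the following minimal sketch. This is illustrative only; the real implementation lives in base_objective.py and may differ in its details:
162 | 
163 |     # Illustrative sketch: see base_objective.py for the actual class
164 |     class BaseObjective:
165 | 
166 |         def __init__(self, study, config, metric = "val_loss", device = "cpu"):
167 |             # Store the inputs that the custom Objective passes through
168 |             self.study = study
169 |             self.config = config
170 |             self.metric = metric
171 |             self.device = device
172 | 
173 |         def __call__(self, trial):
174 |             # Run the user-supplied training loop ...
175 |             results_dictionary = self.train(trial, self.config)
176 |             # ... write the returned metrics to file ...
177 |             self.save(trial, results_dictionary)
178 |             # ... and hand the optimization metric back to optuna
179 |             return results_dictionary[self.metric]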
--------------------------------------------------------------------------------
/aimlutils/torch/optimizers/optimizers.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import logging
4 | import torch.nn as nn
5 | import itertools as it
6 | from typing import Dict
7 | import sys
8 | 
9 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
10 | 
11 | 
12 | def LoadOptimizer(optimizer_type: str, parameters: Dict[str, float], learning_rate: float = 0.001, weight_decay=0.0):
13 | 
14 |     if optimizer_type == "lookahead-diffgrad":
15 |         optimizer = LookaheadDiffGrad(
16 |             parameters, lr=learning_rate, weight_decay=weight_decay)
17 |     elif optimizer_type == "diffgrad":
18 |         optimizer = DiffGrad(parameters, lr=learning_rate,
19 |                              weight_decay=weight_decay)
20 |     elif optimizer_type == "lookahead-radam":
21 |         optimizer = LookaheadRAdam(
22 |             parameters, lr=learning_rate, weight_decay=weight_decay)
23 |     elif optimizer_type == "radam":
24 |         optimizer = RAdam(parameters, lr=learning_rate,
25 |                           weight_decay=weight_decay)
26 |     elif optimizer_type == "adam":
27 |         optimizer = torch.optim.Adam(
28 |             parameters, lr=learning_rate, weight_decay=weight_decay)
29 |     elif optimizer_type == "sgd":
30 |         optimizer = torch.optim.SGD(
31 |             parameters, lr=learning_rate, weight_decay=weight_decay)
32 |     else:
33 |         logger.warning(
34 |             f"Optimizer type {optimizer_type} is unknown. Exiting with error."
35 |         )
36 |         sys.exit(1)
37 | 
38 |     logger.info(
39 |         f"Loaded the {optimizer_type} optimizer with learning rate {learning_rate} and L2 penalty {weight_decay}"
40 |     )
41 |     return optimizer
42 | 
43 | 
44 | class DiffGrad(torch.optim.Optimizer):
45 |     # Original source: https://github.com/shivram1987/diffGrad/blob/master/diffGrad.py
46 |     r"""Implements diffGrad algorithm. It is modified from the pytorch implementation of Adam.
47 |     It has been proposed in `diffGrad: An Optimization Method for Convolutional Neural Networks`_.
48 |     Arguments:
49 |         params (iterable): iterable of parameters to optimize or dicts defining
50 |             parameter groups
51 |         lr (float, optional): learning rate (default: 1e-3)
52 |         betas (Tuple[float, float], optional): coefficients used for computing
53 |             running averages of gradient and its square (default: (0.9, 0.999))
54 |         eps (float, optional): term added to the denominator to improve
55 |             numerical stability (default: 1e-8)
56 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
57 |         amsgrad (boolean, optional): whether to use the AMSGrad variant of this
58 |             algorithm from the paper `On the Convergence of Adam and Beyond`_
59 |             (default: False)
60 |     .. _diffGrad: An Optimization Method for Convolutional Neural Networks:
61 |         https://arxiv.org/abs/1909.11015
62 |     .. _Adam\: A Method for Stochastic Optimization:
63 |         https://arxiv.org/abs/1412.6980
64 |     .. 
_On the Convergence of Adam and Beyond: 65 | https://openreview.net/forum?id=ryQu7f-RZ 66 | """ 67 | 68 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, version=0, weight_decay=0): 69 | if not 0.0 <= lr: 70 | raise ValueError("Invalid learning rate: {}".format(lr)) 71 | if not 0.0 <= eps: 72 | raise ValueError("Invalid epsilon value: {}".format(eps)) 73 | if not 0.0 <= betas[0] < 1.0: 74 | raise ValueError( 75 | "Invalid beta parameter at index 0: {}".format(betas[0])) 76 | if not 0.0 <= betas[1] < 1.0: 77 | raise ValueError( 78 | "Invalid beta parameter at index 1: {}".format(betas[1])) 79 | 80 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 81 | 82 | super().__init__(params, defaults) 83 | 84 | # save version 85 | self.version = version 86 | 87 | def __setstate__(self, state): 88 | super().__setstate__(state) 89 | 90 | def step(self, closure=None): 91 | """Performs a single optimization step. 92 | Arguments: 93 | closure (callable, optional): A closure that reevaluates the model 94 | and returns the loss. 95 | """ 96 | loss = None 97 | if closure is not None: 98 | loss = closure() 99 | 100 | for group in self.param_groups: 101 | for p in group['params']: 102 | if p.grad is None: 103 | continue 104 | grad = p.grad.data 105 | if grad.is_sparse: 106 | raise RuntimeError( 107 | 'diffGrad does not support sparse gradients, please consider SparseAdam instead') 108 | 109 | state = self.state[p] 110 | 111 | # State initialization 112 | if len(state) == 0: 113 | state['step'] = 0 114 | # Exponential moving average of gradient values 115 | state['exp_avg'] = torch.zeros_like(p.data) 116 | # Exponential moving average of squared gradient values 117 | state['exp_avg_sq'] = torch.zeros_like(p.data) 118 | # Previous gradient 119 | state['previous_grad'] = torch.zeros_like(p.data) 120 | 121 | exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad'] 122 | beta1, beta2 = group['betas'] 123 | 124 | state['step'] += 1 125 | 126 | if group['weight_decay'] != 0: 127 | grad.add_(group['weight_decay'], p.data) 128 | 129 | # Decay the first and second moment running average coefficient 130 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 131 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 132 | denom = exp_avg_sq.sqrt().add_(group['eps']) 133 | 134 | bias_correction1 = 1 - beta1 ** state['step'] 135 | bias_correction2 = 1 - beta2 ** state['step'] 136 | 137 | # compute diffgrad coefficient (dfc) 138 | 139 | if self.version == 0: 140 | diff = abs(previous_grad - grad) 141 | elif self.version == 1: 142 | diff = previous_grad-grad 143 | elif self.version == 2: 144 | diff = .5*abs(previous_grad - grad) 145 | 146 | if self.version == 0 or self.version == 1: 147 | dfc = 1. / (1. + torch.exp(-diff)) 148 | elif self.version == 2: 149 | # DFC2 = 9/(1+e-(.5/g/)-4 #range .5,5 150 | dfc = 9. / (1. 
+ torch.exp(-diff))-4 151 | 152 | state['previous_grad'] = grad 153 | 154 | # update momentum with dfc 155 | exp_avg1 = exp_avg * dfc 156 | 157 | step_size = group['lr'] * \ 158 | math.sqrt(bias_correction2) / bias_correction1 159 | 160 | p.data.addcdiv_(-step_size, exp_avg1, denom) 161 | 162 | return loss 163 | 164 | 165 | class LookaheadDiffGrad(torch.optim.Optimizer): 166 | def __init__(self, 167 | params, 168 | lr=1e-3, 169 | betas=(0.9, 0.999), 170 | eps=1e-8, 171 | weight_decay=0, 172 | alpha=0.5, 173 | k=6): 174 | 175 | if not 0.0 <= alpha <= 1.0: 176 | raise ValueError(f'Invalid slow update rate: {alpha}') 177 | if not 1 <= k: 178 | raise ValueError(f'Invalid lookahead steps: {k}') 179 | 180 | base_optimizer = DiffGrad( 181 | params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 182 | self.buffer = [[None, None, None] for ind in range(10)] 183 | self.optimizer = base_optimizer 184 | self.param_groups = self.optimizer.param_groups 185 | self.alpha = alpha 186 | self.k = k 187 | for group in self.param_groups: 188 | group["step_counter"] = 0 189 | self.slow_weights = [[p.clone().detach() for p in group['params']] 190 | for group in self.param_groups] 191 | 192 | for w in it.chain(*self.slow_weights): 193 | w.requires_grad = False 194 | 195 | self.state = base_optimizer.state 196 | 197 | def step(self, closure=None): 198 | loss = None 199 | if closure is not None: 200 | loss = closure() 201 | loss = self.optimizer.step() 202 | for group, slow_weights in zip(self.param_groups, self.slow_weights): 203 | group['step_counter'] += 1 204 | if group['step_counter'] % self.k != 0: 205 | continue 206 | for p, q in zip(group['params'], slow_weights): 207 | if p.grad is None: 208 | continue 209 | q.data.add_(self.alpha, p.data - q.data) 210 | p.data.copy_(q.data) 211 | self.state = self.optimizer.state 212 | return loss 213 | 214 | 215 | class RAdam(torch.optim.Optimizer): 216 | # from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam.py 217 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 218 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 219 | self.buffer = [[None, None, None] for ind in range(10)] 220 | super(RAdam, self).__init__(params, defaults) 221 | 222 | def __setstate__(self, state): 223 | super(RAdam, self).__setstate__(state) 224 | 225 | def step(self, closure=None): 226 | 227 | loss = None 228 | if closure is not None: 229 | loss = closure() 230 | 231 | for group in self.param_groups: 232 | 233 | for p in group['params']: 234 | if p.grad is None: 235 | continue 236 | grad = p.grad.data.float() 237 | if grad.is_sparse: 238 | raise RuntimeError( 239 | 'RAdam does not support sparse gradients') 240 | 241 | p_data_fp32 = p.data.float() 242 | 243 | state = self.state[p] 244 | 245 | if len(state) == 0: 246 | state['step'] = 0 247 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 248 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 249 | else: 250 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 251 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as( 252 | p_data_fp32) 253 | 254 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 255 | beta1, beta2 = group['betas'] 256 | 257 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 258 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 259 | 260 | state['step'] += 1 261 | buffered = self.buffer[int(state['step'] % 10)] 262 | if state['step'] == buffered[0]: 263 | N_sma, step_size = buffered[1], buffered[2] 264 | else: 265 | buffered[0] = 
state['step']
266 |                     beta2_t = beta2 ** state['step']
267 |                     N_sma_max = 2 / (1 - beta2) - 1
268 |                     N_sma = N_sma_max - 2 * \
269 |                         state['step'] * beta2_t / (1 - beta2_t)
270 |                     buffered[1] = N_sma
271 | 
272 |                     # more conservative since it's an approximated value
273 |                     if N_sma >= 5:
274 |                         step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
275 |                             N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
276 |                     else:
277 |                         step_size = 1.0 / (1 - beta1 ** state['step'])
278 |                     buffered[2] = step_size
279 | 
280 |                 if group['weight_decay'] != 0:
281 |                     p_data_fp32.add_(-group['weight_decay']
282 |                                      * group['lr'], p_data_fp32)
283 | 
284 |                 # more conservative since it's an approximated value
285 |                 if N_sma >= 5:
286 |                     denom = exp_avg_sq.sqrt().add_(group['eps'])
287 |                     p_data_fp32.addcdiv_(-step_size *
288 |                                          group['lr'], exp_avg, denom)
289 |                 else:
290 |                     p_data_fp32.add_(-step_size * group['lr'], exp_avg)
291 | 
292 |                 p.data.copy_(p_data_fp32)
293 | 
294 |         return loss
295 | 
296 | 
297 | class LookaheadRAdam(torch.optim.Optimizer):
298 |     def __init__(self,
299 |                  params,
300 |                  lr=1e-3,
301 |                  betas=(0.9, 0.999),
302 |                  eps=1e-8,
303 |                  weight_decay=0,
304 |                  alpha=0.5,
305 |                  k=6):
306 | 
307 |         if not 0.0 <= alpha <= 1.0:
308 |             raise ValueError(f'Invalid slow update rate: {alpha}')
309 |         if not 1 <= k:
310 |             raise ValueError(f'Invalid lookahead steps: {k}')
311 | 
312 |         base_optimizer = RAdam(params, lr=lr, betas=betas,
313 |                                eps=eps, weight_decay=weight_decay)
314 |         self.buffer = [[None, None, None] for ind in range(10)]
315 |         self.optimizer = base_optimizer
316 |         self.param_groups = self.optimizer.param_groups
317 |         self.alpha = alpha
318 |         self.k = k
319 |         for group in self.param_groups:
320 |             group["step_counter"] = 0
321 |         self.slow_weights = [[p.clone().detach() for p in group['params']]
322 |                              for group in self.param_groups]
323 | 
324 |         for w in it.chain(*self.slow_weights):
325 |             w.requires_grad = False
326 | 
327 |         self.state = base_optimizer.state
328 | 
329 |     def step(self, closure=None):
330 |         loss = None
331 |         if closure is not None:
332 |             loss = closure()
333 |         loss = self.optimizer.step()
334 |         for group, slow_weights in zip(self.param_groups, self.slow_weights):
335 |             group['step_counter'] += 1
336 |             if group['step_counter'] % self.k != 0:
337 |                 continue
338 |             for p, q in zip(group['params'], slow_weights):
339 |                 if p.grad is None:
340 |                     continue
341 |                 q.data.add_(self.alpha, p.data - q.data)
342 |                 p.data.copy_(q.data)
343 |         self.state = self.optimizer.state
344 | 
--------------------------------------------------------------------------------
/aimlutils/echo/README.md:
--------------------------------------------------------------------------------
1 | # **E**arth **C**omputing **H**yperparameter **O**ptimization (ECHO): A distributed hyperparameter optimization package built with Optuna
2 | 
3 | ### Usage
4 | 
5 | Run the hyperparameter optimization script:
6 | ```bash
7 | python optimize.py hyperparameters.yml model_config.yml
8 | ```
9 | Run the report script to get a dataframe of the results saved in the study:
10 | ```bash
11 | python report.py hyperparameters.yml [-p plot_config.yml]
12 | ```
13 | ### Dependencies
14 | 
15 | There are three files that must be supplied to use the optimize script:
16 | 
17 | * A custom objective class that trains your model and returns the metric to be optimized.
18 | 
19 | * A configuration file specifying the hyperparameter optimization settings.
20 | 
21 | * A model configuration file that contains the information needed to train your model (see examples in the holodec and gecko projects).
22 | 
23 | ### Custom objective class
24 | 
25 | The custom **Objective** class (objective.py) must inherit a **BaseObjective** class (which lives in base_objective.py), and must contain a method named **train** that returns the value of the optimization metric (in a dictionary, see below). There are example objective scripts for both torch and Keras in the examples directory. Your custom Objective class will inherit all of the methods and attributes from the BaseObjective. The Objective's train does not depend on the machine learning library used! For example, a simple template has the following structure:
26 | 
27 | ```python
28 | from aimlutils.echo.src.base_objective import *
29 | from aimlutils.echo.src.pruners import KerasPruningCallback
30 | 
31 | class Objective(BaseObjective):
32 | 
33 |     def __init__(self, config, metric = "val_loss", device = "cpu"):
34 | 
35 |         # Initialize the base class
36 |         BaseObjective.__init__(self, config, metric, device)
37 | 
38 |     def train(self, trial, conf):
39 | 
40 |         # Make any custom edits to the model conf before using it to train a model.
41 |         conf = custom_updates(trial, conf)
42 | 
43 |         ... (load data sets, build model, etc)
44 | 
45 |         callbacks = [KerasPruningCallback(trial, self.metric, interval = 1)]
46 |         result = Model.fit(..., callbacks = callbacks)
47 | 
48 |         results_dictionary = {
49 |             "val_loss": result["val_loss"],
50 |             "loss": result["loss"],
51 |             ...
52 |             "val_accuracy": result["val_accuracy"]
53 |         }
54 |         return results_dictionary
55 | ```
56 | You can have as many inputs to your custom Objective as needed, as long as those that are required to initialize the base class are included. The Objective class will call the train method from the inherited dunder **__call__** method, and will finish up by calling the inherited save method that writes the metric(s) details to disk. Note that, because one class inherits from the other, you do not have to supply these two methods, as they are pre-coded in the base class. You can customize them at your leisure using overriding methods in your custom Objective. Check out the scripts base_objective.py and run.py to see how things are structured and called.
57 | 
58 | As noted, the metric used to evaluate the model's training performance must be in the results dictionary. Other metrics that the user may want to track will be saved to disk if they are included in the results dictionary (the keys of the dictionary are used to name the columns in a pandas dataframe). See the example above where several metrics are being returned.
59 | 
60 | Note that the first line in the train method states that any custom changes to the model configuration (conf) must be done here. If custom changes are required, the user may supply a method named **custom_updates** in addition to the Objective class (you may save both in the same script, or import the method from somewhere else in your custom Objective script). See also the section **Custom model configuration updates** below for an example.
61 | 
62 | Finally, if using Keras, you need to include the (customized) KerasPruningCallback that will allow optuna to terminate unpromising trials. We do something similar when using torch; see the examples directory.
63 | 
64 | ### Hyperparameter optimizer configuration
65 | 
66 | There are several fields (log, slurm, pbs, and optuna), each with variable subfields.
The log field allows us to save a file for printing messages and warnings that are placed in areas throughout the package. The slurm/pbs fields allow the user to specify how many GPU nodes should be used, and support any slurm or pbs setting. The optuna field allows the user to configure the optimization procedure, including specifying which parameters will be used, as well as the performance metric. For example, consider the configuration settings:
67 | 
68 | ```yaml
69 | pbs:
70 |   jobs: 10
71 |   kernel: "ncar_pylib /glade/work/schreck/py37"
72 |   bash: ["module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1"]
73 |   batch:
74 |     l: ["select=1:ncpus=8:ngpus=1:mem=128GB", "walltime=12:00:00"]
75 |     A: "NAML0001"
76 |     q: "casper"
77 |     N: "echo_trial"
78 |     o: "echo_trial.out"
79 |     e: "echo_trial.err"
80 | slurm:
81 |   jobs: 15
82 |   kernel: "ncar_pylib /glade/work/schreck/py37"
83 |   bash: ["module load ncarenv/1.3 gnu/8.3.0 openmpi/3.1.4 python/3.7.5 cuda/10.1"]
84 |   batch:
85 |     account: "NAML0001"
86 |     gres: "gpu:v100:1"
87 |     mem: "128G"
88 |     n: 8
89 |     t: "12:00:00"
90 |     J: "echo_trial"
91 |     o: "echo_trial.out"
92 |     e: "echo_trial.err"
93 | optuna:
94 |   study_name: "holodec_optimization"
95 |   storage: "sqlite:///path/to/data/storage.db"
96 |   reload: 0
97 |   objective: "examples/torch/objective.py"
98 |   metric: "val_loss"
99 |   direction: "minimize"
100 |   n_trials: 500
101 |   gpu: True
102 |   save_path: 'test'
103 |   sampler:
104 |     type: "TPESampler"
105 |     n_startup_trials: 30
106 |   parameters:
107 |     num_dense:
108 |       type: "int"
109 |       settings:
110 |         name: "num_dense"
111 |         low: 0
112 |         high: 10
113 |     dropout:
114 |       type: "float"
115 |       settings:
116 |         name: "dr"
117 |         low: 0.0
118 |         high: 0.5
119 |     optimizer:learning_rate:
120 |       type: "loguniform"
121 |       settings:
122 |         name: "lr"
123 |         low: 0.0000001
124 |         high: 0.01
125 |     model:activation:
126 |       type: "categorical"
127 |       settings:
128 |         name: "activation"
129 |         choices: ["relu", "linear", "leaky", "elu", "prelu"]
130 | log [optional]:
131 |   save_path: "path/to/data/log.txt"
132 | ```
133 | 
134 | The subfields within "pbs" and "slurm" should mostly be familiar to you. In this example there would be 10 jobs submitted to the pbs queue and 15 jobs to the slurm queue. The kernel field is optional and can be any call(s) to activate a conda/python/ncar_pylib/etc environment. Additional snippets that you might need in your launch script can be added to the list in the "bash" field; for example, as shown above, modules may need to be loaded before training a model. Note that the bash options will be run in order, and before the kernel field. Remove or leave the kernel field blank if you do not need it.
135 | 
136 | The subfields within the "optuna" field have the following functionality:
137 | 
138 | * study_name: The name of the study.
139 | * storage: sqlite or mysql destination.
140 | * reload: Whether to continue using a previous study (True) or to initialize a new study (False). If the initial number of workers does not reach the number of trials and you wish to resubmit, set this to True.
141 | * objective: The path to the user-supplied objective class (it must be named objective.py).
142 | * metric: The metric to be used to determine the model performance.
143 | * direction: Indicates which direction the metric must go to represent improvement (pick from maximize or minimize).
144 | * n_trials: The number of trials in the study.
145 | * gpu: Use the gpu or cpu.
146 | * save_path: Directory path where data will be saved.
147 | 
148 | * sampler
149 |   + type: Choose how optuna will do parameter estimation. The default choice both here and in optuna is the [Tree-structured Parzen Estimator Approach](https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f), [e.g. TPESampler](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf). See the optuna documentation for the different options. For some samplers (e.g. GridSearch) additional fields may be included (e.g. search_space).
150 | * parameters
151 |   + type: Option to select an optuna trial setting. See the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform) for what is available. Currently, this package supports the available options from optuna: "categorical", "discrete_uniform", "float", "int", "loguniform", and "uniform".
152 |   + settings: This dictionary field allows you to specify any settings that accompany the optuna trial type. In the example above, the named num_dense parameter is stated to be an integer with values ranging from 0 to 10. To see all the available options, consult the [optuna Trial documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html?highlight=suggest#optuna.trial.Trial.suggest_uniform).
153 | 
154 | Lastly, the "log" field allows you to save the logging details to file; they will always be printed to stdout. If this field is removed, logging details will only be printed to stdout.
155 | 
156 | ### Model configuration
157 | 
158 | The model configuration file can be what you have been using up to this point to train your model; in other words, no changes are necessary. This package will take the suggested hyperparameters from an optuna trial and make changes to the model configuration on the fly. This can either be done automatically by this package, or the user may supply an additional method for making custom changes. For example, consider the (truncated) configuration for training a model to predict hologram properties with a holodec dataset:
159 | 
160 | ```yaml
161 | model:
162 |   image_channels: 1
163 |   hidden_dims: [3, 94, 141, 471, 425, 1122]
164 |   z_dim: 1277
165 |   dense_hidden_dims: [1000]
166 |   dense_dropouts: [0.0]
167 |   tasks: ["x", "y", "z", "d", "binary"]
168 |   activation: "relu"
169 | optimizer:
170 |   type: "lookahead-diffgrad"
171 |   learning_rate: 0.000631
172 |   weight_decay: 0.0
173 | trainer:
174 |   start_epoch: 0
175 |   epochs: 1
176 |   clip: 1.0
177 |   alpha: 1.0
178 |   beta: 0.1
179 |   path_save: "test"
180 | ```
181 | 
182 | The model configuration will be automatically updated if and only if the name of the parameter specified in the hyperparameter configuration, optuna.parameters, can be used as a nested lookup key in the model configuration file. For example, observe in the hyperparameter configuration file above that the named parameter **optimizer:learning_rate** contains a colon, which is used downstream to split the named parameter into multiple keys that allow us, starting from the top of the nested tree in the model configuration file, to work our way down until the relevant field is located and the trial-suggested value is substituted in. In this example, the split keys are ["optimizer", "learning_rate"].
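183 | 
184 | To make this substitution concrete, below is a minimal sketch of that kind of nested update. It is for illustration only: the actual logic lives in run.py, and update_config is a hypothetical name rather than part of the package API.
185 | 
186 | ```python
187 | def update_config(conf, named_parameter, value):
188 |     # Split e.g. "optimizer:learning_rate" into ["optimizer", "learning_rate"],
189 |     # walk down the nested model configuration, and substitute the
190 |     # trial-suggested value at the final key.
191 |     keys = named_parameter.split(":")
192 |     node = conf
193 |     for key in keys[:-1]:
194 |         node = node[key]
195 |     node[keys[-1]] = value
196 | 
197 | conf = {"optimizer": {"type": "lookahead-diffgrad", "learning_rate": 0.000631}}
198 | update_config(conf, "optimizer:learning_rate", 0.001)  # conf["optimizer"]["learning_rate"] is now 0.001
199 | ```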
200 | 
201 | This scheme will work in general as long as the named parameter in optuna.parameters uses : as the separator, and, once split, the resulting list can be used to locate the relevant field in the model configuration.
202 | 
203 | Note that optuna has a limited range of trial parameter types, all but one of which are numerical in one form or another. If you wanted to optimize the activation layer(s) in your neural network as in the example above, you could do so by utilizing the "categorical" trial suggestion. For example, the following list of activation layer names could be specified: ["relu", "linear", "leaky", "elu", "prelu"].
204 | 
205 | 
206 | ### Custom model configuration updates
207 | 
208 | You may additionally supply rules for updating the model configuration file by including a method named **custom_updates**, which will make the desired changes to the configuration file using optuna trial parameter guesses.
209 | 
210 | In the example configurations described above, the hyperparameter configuration contained an optuna.parameters field "num_dense," but this field is not present in the model configuration. There is however a "dense_hidden_dims" field in the model configuration that contains a list of the layer sizes in the model, where the number of layers is the length of the list. In our example, just one layer is specified, but we want to vary that number.
211 | 
212 | To use the "num_dense" hyperparameter from the hyperparameter configuration file, we can create the following method:
213 | 
214 | ```python
215 | def custom_updates(trial, conf):
216 | 
217 |     # Get list of hyperparameters from the config
218 |     hyperparameters = conf["optuna"]["parameters"]
219 | 
220 |     # Now update some via custom rules (unpack the "settings" subfield, matching the "int" type above)
221 |     num_dense = trial.suggest_int(**hyperparameters["num_dense"]["settings"])
222 | 
223 |     # Update the config based on optuna's suggestion
224 |     conf["model"]["dense_hidden_dims"] = [1000 for k in range(num_dense)]
225 | 
226 |     return conf
227 | ```
228 | 
229 | The method should be called first thing in the custom Objective.train method (see the example Objective above). You may have noticed that the configuration (named conf) contains both hyperparameter and model fields. This package will copy the hyperparameter optuna field to the model configuration for convenience, so that we can reduce the total number of class and method dependencies (which helps me keep the code generalized). This occurs in the run.py script.
230 | 
231 | ### Custom plot settings for report.py
232 | 
233 | The script report.py will load the current study, identify the best trial in the study, and will compute the relative importance of each parameter using both fanova and MDI (see [here](https://optuna.readthedocs.io/en/v1.3.0/reference/importance.html) for details).
234 | 
235 | Additionally, the script will create two figures, an optimization history plot and an intermediate values plot. If your objective returns two metrics to be optimized, only the pareto front plot will be generated. See the [documentation](https://optuna.readthedocs.io/en/v1.3.0/reference/visualization.html) for details on the plots.
236 | 
237 | Note that ECHO only supports the [matplotlib](https://optuna.readthedocs.io/en/latest/reference/visualization/matplotlib.html) generated plots from Optuna, for now. Optuna's default is to use plot.ly; however, not all LTS Jupyter-lab environments support that backend.
238 | 
239 | The user may customize the plots to a degree by additionally supplying a plot configuration yaml file (named plot_config.yml above, and called as an optional argument using the parser flag -p or --plot). Currently, the user may only adjust the rcParams backend variables (see [here](https://matplotlib.org/3.3.3/tutorials/introductory/customizing.html) for a comprehensive list) plus a limited set of other variables (see below):
240 | 
241 | ```yaml
242 | optimization_history:
243 |   save_path: '/glade/work/schreck/repos/holodec-ml/scripts/schreck/decoder/results/opt_multi_particle'
244 |   set_xlim: [0, 100]
245 |   set_ylim: [3e4, 1e6]
246 |   set_xscale: "log"
247 |   set_yscale: "log"
248 |   rcparams:
249 |     'backend': 'ps'
250 |     'lines.markersize' : 4
251 |     'axes.labelsize': 10
252 |     'legend.fontsize': 10
253 |     'xtick.labelsize': 10
254 |     'ytick.labelsize': 10
255 |     'xtick.top': True
256 |     'xtick.bottom': True
257 |     'ytick.right': True
258 |     'ytick.left': True
259 |     'xtick.direction': 'in'
260 |     'ytick.direction': 'in'
261 |     'font.serif' : 'Helvetica'
262 |     'figure.dpi' : 600
263 |     'figure.autolayout': True
264 |     'legend.numpoints' : 1
265 |     'legend.handlelength' : 1.0
266 |     'legend.columnspacing' : 1.0
267 | ```
268 | 
269 | For the other supported plots, simply add or change "optimization_history" to "intermediate_values", or if optimizing more than one metric, "pareto_front".
270 | 
--------------------------------------------------------------------------------
/aimlutils/echo/optimize.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | 
4 | import os
5 | import sys
6 | import yaml
7 | import optuna
8 | import logging
9 | import subprocess
10 | from argparse import ArgumentParser
11 | from aimlutils.echo.src.samplers import samplers
12 | from typing import Dict
13 | 
14 | 
15 | def args():
16 |     parser = ArgumentParser(description=
17 |         "ECHO: A distributed multi-gpu hyperparameter optimization package built with Optuna"
18 |     )
19 | 
20 |     parser.add_argument("hyperparameter", type=str, help=
21 |         "Path to the hyperparameter configuration containing your inputs."
22 |     )
23 | 
24 |     parser.add_argument("model", type=str, help=
25 |         "Path to the model configuration containing your inputs."
26 |     )
27 |     parser.add_argument(
28 |         "-n",
29 |         "--study_name",
30 |         dest="study_name",
31 |         type=str,
32 |         default=False,
33 |         help="The name of the study"
34 |     )
35 |     parser.add_argument(
36 |         "--override",
37 |         dest="override",
38 |         type=bool,
39 |         default=False,
40 |         help="Force remove the study name from the storage"
41 |     )
42 |     parser.add_argument(
43 |         "-r",
44 |         "--reload",
45 |         dest="reload",
46 |         type=str,
47 |         default=False,
48 |         help="Set = 0 to initiate a new study, = 1 to continue a study"
49 |     )
50 |     parser.add_argument(
51 |         "-o",
52 |         "--objective",
53 |         dest="objective",
54 |         type=str,
55 |         default=False,
56 |         help="Path to the supplied objective class"
57 |     )
58 |     parser.add_argument(
59 |         "-d",
60 |         "--direction",
61 |         dest="direction",
62 |         type=str,
63 |         default=False,
64 |         help="Direction of the metric. 
Choose from maximize or minimize"
65 |     )
66 |     parser.add_argument(
67 |         "-m",
68 |         "--metric",
69 |         dest="metric",
70 |         type=str,
71 |         default=False,
72 |         help="The validation metric"
73 |     )
74 |     parser.add_argument(
75 |         "-t",
76 |         "--trials",
77 |         dest="n_trials",
78 |         type=str,
79 |         default=False,
80 |         help="The number of trials in the study"
81 |     )
82 |     parser.add_argument(
83 |         "-g",
84 |         "--gpu",
85 |         dest="gpu",
86 |         type=str,
87 |         default=False,
88 |         help="Use the gpu or not (bool)"
89 |     )
90 |     parser.add_argument(
91 |         "-s",
92 |         "--save_path",
93 |         dest="save_path",
94 |         type=str,
95 |         default=False,
96 |         help="Path to the save directory"
97 |     )
98 |     parser.add_argument(
99 |         "-c",
100 |         "--create_study",
101 |         dest="create_study",
102 |         type=str,
103 |         default=False,
104 |         help="Create a study but do not submit any workers"
105 |     )
106 |     return vars(parser.parse_args())
107 | 
108 | 
109 | def fix_broken_study(_study: optuna.study.Study,
110 |                      name: str,
111 |                      storage: str,
112 |                      direction: str,
113 |                      sampler: optuna.samplers.BaseSampler):
114 | 
115 |     """
116 |     This method removes broken trials, which are those
117 |     that failed to complete 1 epoch before slurm (or something else) killed the job
118 |     and returned NAN or NONE.
119 | 
120 |     Failure to remove these trials leads to an error when optuna tries to update the
121 |     parameters. This is because these trials only have "NoneType" data associated
122 |     with them, but we need numerical data (e.g. the loss value) to update parameters.
123 |     """
124 | 
125 |     if len(_study.trials) == 0:
126 |         return _study, []
127 | 
128 |     trials = []
129 |     removed = []
130 |     for trial in _study.trials:
131 |         if len(trial.intermediate_values) == 0:
132 |             trials.append(trial)
133 |             continue
134 |         step, intermediate_value = max(trial.intermediate_values.items())
135 |         if intermediate_value is not None:
136 |             trials.append(trial)
137 |         else:
138 |             removed.append(trial.number+1)
139 | 
140 |     if len(removed) == 0:
141 |         return _study, []
142 | 
143 |     # Delete the current study
144 |     optuna.delete_study(study_name=name, storage=storage)
145 | 
146 |     # Create a new one in its place
147 |     if isinstance(direction, str):
148 |         study_fixed = optuna.create_study(study_name=name,
149 |                                           storage=storage,
150 |                                           direction=direction,
151 |                                           sampler=sampler,
152 |                                           load_if_exists=False)
153 |     else:
154 |         study_fixed = optuna.multi_objective.create_study(
155 |             study_name=name,
156 |             storage=storage,
157 |             directions=direction,
158 |             sampler=sampler,
159 |             load_if_exists=False
160 |         )
161 | 
162 |     # Add the working trials to the new study
163 |     for trial in trials:
164 |         study_fixed.add_trial(trial)
165 | 
166 |     return study_fixed, removed
167 | 
168 | 
169 | def prepare_slurm_launch_script(hyper_config: str,
170 |                                 model_config: str):
171 | 
172 |     slurm_options = ["#!/bin/bash -l"]
173 |     slurm_options += [
174 |         f"#SBATCH -{arg} {val}" if len(arg) == 1 else f"#SBATCH --{arg}={val}"
175 |         for arg, val in hyper_config["slurm"]["batch"].items()
176 |     ]
177 |     if "bash" in hyper_config["slurm"]:
178 |         if len(hyper_config["slurm"]["bash"]) > 0:
179 |             for line in hyper_config["slurm"]["bash"]:
180 |                 slurm_options.append(line)
181 |     if "kernel" in hyper_config["slurm"]:
182 |         if hyper_config["slurm"]["kernel"] is not None:
183 |             slurm_options.append(f'{hyper_config["slurm"]["kernel"]}')
184 |     import aimlutils.echo as opt
185 |     aiml_path = os.path.join(
186 |         os.path.abspath(opt.__file__).replace("__init__.py", ""),
187 |         "run.py"
188 |     )
189 |     slurm_options.append(f"python {aiml_path} 
{sys.argv[1]} {sys.argv[2]}")
190 |     return slurm_options
191 | 
192 | 
193 | def prepare_pbs_launch_script(hyper_config: str,
194 |                               model_config: str):
195 | 
196 |     pbs_options = ["#!/bin/bash -l"]
197 |     for arg, val in hyper_config["pbs"]["batch"].items():
198 |         if arg == "l" and type(val) == list:
199 |             for opt in val:
200 |                 pbs_options.append(f"#PBS -{arg} {opt}")
201 |         elif len(arg) == 1:
202 |             pbs_options.append(f"#PBS -{arg} {val}")
203 |         else:
204 |             pbs_options.append(f"#PBS --{arg}={val}")
205 |     if "bash" in hyper_config["pbs"]:
206 |         if len(hyper_config["pbs"]["bash"]) > 0:
207 |             for line in hyper_config["pbs"]["bash"]:
208 |                 pbs_options.append(line)
209 |     if "kernel" in hyper_config["pbs"]:
210 |         if hyper_config["pbs"]["kernel"] is not None:
211 |             pbs_options.append(f'{hyper_config["pbs"]["kernel"]}')
212 |     import aimlutils.echo as opt
213 |     aiml_path = os.path.join(
214 |         os.path.abspath(opt.__file__).replace("__init__.py", ""),
215 |         "run.py"
216 |     )
217 |     pbs_options.append(f"python {aiml_path} {sys.argv[1]} {sys.argv[2]}")
218 |     return pbs_options
219 | 
220 | 
221 | def configuration_report(_dict: Dict[str, str],
222 |                          path: list = None):
223 | 
224 |     if path is None:
225 |         path = []
226 |     for k,v in _dict.items():
227 |         newpath = path + [k]
228 |         if isinstance(v, dict):
229 |             for u in configuration_report(v, newpath):
230 |                 yield u
231 |         else:
232 |             yield newpath, v
233 | 
234 | 
235 | if __name__ == "__main__":
236 | 
237 |     args_dict = args()
238 | 
239 |     hyper_config = args_dict.pop("hyperparameter")
240 |     model_config = args_dict.pop("model")
241 | 
242 |     if not hyper_config or not model_config:
243 |         raise OSError(
244 |             "Usage: python optimize.py hyperparameter.yml model.yml [optional parser options]"
245 |         )
246 | 
247 |     if os.path.isfile(hyper_config):
248 |         with open(hyper_config) as f:
249 |             hyper_config = yaml.load(f, Loader=yaml.FullLoader)
250 |     else:
251 |         raise OSError(
252 |             f"Hyperparameter optimization config file {sys.argv[1]} does not exist"
253 |         )
254 | 
255 |     if os.path.isfile(model_config):
256 |         with open(model_config) as f:
257 |             model_config = yaml.load(f, Loader=yaml.FullLoader)
258 |     else:
259 |         raise OSError(
260 |             f"Model config file {sys.argv[2]} does not exist"
261 |         )
262 | 
263 |     # Set up a logger
264 |     root = logging.getLogger()
265 |     root.setLevel(logging.DEBUG)
266 |     formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
267 | 
268 |     # Stream output to stdout
269 |     ch = logging.StreamHandler()
270 |     ch.setLevel(logging.INFO)
271 |     ch.setFormatter(formatter)
272 |     root.addHandler(ch)
273 | 
274 |     # Stream output to file
275 |     if "log" in hyper_config:
276 |         savepath = hyper_config["log"]["save_path"] if "save_path" in hyper_config["log"] else "log.txt"
277 |         mode = "a+" if bool(hyper_config["optuna"]["reload"]) else "w"
278 |         fh = logging.FileHandler(savepath,
279 |                                  mode=mode,
280 |                                  encoding='utf-8')
281 |         fh.setLevel(logging.DEBUG)
282 |         fh.setFormatter(formatter)
283 |         root.addHandler(fh)
284 | 
285 |     # Override other options in hyperparameter config file, if supplied. 
286 |     for name, val in args_dict.items():
287 |         if val and (name in hyper_config["optuna"]):
288 |             current_value = hyper_config["optuna"][name]
289 |             logging.info(
290 |                 f"Overriding {name} in the hyperparameter configuration: {current_value} -> {val}"
291 |             )
292 |             hyper_config["optuna"][name] = val
293 | 
294 |     # Print the configurations to the logger
295 |     logging.info("Current hyperparameter configuration settings:")
296 |     for p, v in configuration_report(hyper_config):
297 |         full_path = ".".join([str(_p) for _p in p])
298 |         logging.info(f"{full_path}: {v}")
299 |     logging.info("Current model configuration settings:")
300 |     for p, v in configuration_report(model_config):
301 |         full_path = ".".join([str(_p) for _p in p])
302 |         logging.info(f"{full_path}: {v}")
303 | 
304 |     # Set up new db entry if reload = 0
305 |     reload_study = bool(hyper_config["optuna"]["reload"])
306 | 
307 |     # Check if save directory exists
308 |     if not os.path.isdir(hyper_config["optuna"]["save_path"]):
309 |         raise OSError(
310 |             f'Create the save directory {hyper_config["optuna"]["save_path"]} and try again'
311 |         )
312 | 
313 |     study_name = hyper_config["optuna"]["study_name"]
314 |     #path_to_study = os.path.join(hyper_config["optuna"]["save_path"], name)
315 |     #storage = f"sqlite:///{path_to_study}"
316 |     storage = hyper_config["optuna"]["storage"]
317 |     direction = hyper_config["optuna"]["direction"]
318 |     single_objective = isinstance(direction, str)
319 | 
320 |     # Initialize the sampler
321 |     if "sampler" not in hyper_config["optuna"]:
322 |         if single_objective:  # single-objective
323 |             sampler = optuna.samplers.TPESampler()
324 |         else:  # multi-objective equivalent of TPESampler
325 |             sampler = optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler()
326 |     else:
327 |         sampler = samplers(hyper_config["optuna"]["sampler"])
328 | 
329 |     # Initiate a study for the first time
330 |     if not reload_study:
331 | 
332 |         # Check the direction
333 |         if isinstance(direction, list):
334 |             for direc in direction:
335 |                 if direc not in ["maximize", "minimize"]:
336 |                     raise OSError(
337 |                         f"Optimizer direction {direc} not recognized. Choose from maximize or minimize"
338 |                     )
339 | 
340 |         else:
341 |             if direction not in ["maximize", "minimize"]:
342 |                 raise OSError(
343 |                     f"Optimizer direction {direction} not recognized. Choose from maximize or minimize"
344 |                 )
345 | 
346 |         # Check if the study record already exists.
347 |         try:
348 |             optuna.load_study(
349 |                 study_name = study_name,
350 |                 storage = storage,
351 |                 #direction = direction,
352 |                 sampler = sampler
353 |             )
354 |         except KeyError:  # The study name was not in storage, can proceed
355 |             pass
356 | 
357 |         except:
358 |             if args_dict["override"]:
359 |                 message = f"Removing the study_name {study_name} that exists in storage {storage}."
360 |                 logging.warning(message)
361 |                 # delete_study only needs the name and storage to drop the record
362 |                 optuna.delete_study(
363 |                     study_name = study_name,
364 |                     storage = storage
365 |                 )
366 |             else:
367 |                 message = f"The study {study_name} already exists in storage and reload was False."
368 | message += f" Delete it from {storage}, and try again or rerun this script" 369 | message += f" with the flag: --override 1" 370 | raise OSError(message) 371 | 372 | # Create a new study in the storage object 373 | if single_objective: 374 | create_study = optuna.create_study( 375 | study_name = study_name, 376 | storage = storage, 377 | direction = direction, 378 | sampler = sampler 379 | ) 380 | else: 381 | create_study = optuna.multi_objective.study.create_study( 382 | study_name = study_name, 383 | storage = storage, 384 | directions = direction, 385 | sampler = sampler 386 | ) 387 | 388 | # Check to see if there are any broken trials 389 | else: 390 | #if not os.path.isfile(path_to_study): 391 | # raise OSError("Reload was true but the study does not yet exist. Set reload = 0 and try again.") 392 | 393 | logging.info( 394 | f"Checking the study for broken trials (those that did not complete 1 epoch before dying)" 395 | ) 396 | if single_objective: 397 | study = optuna.load_study( 398 | study_name = study_name, 399 | storage = storage, 400 | sampler = sampler 401 | ) 402 | else: 403 | study = optuna.multi_objective.study.load_study( 404 | study_name = study_name, 405 | storage = storage, 406 | sampler = sampler 407 | ) 408 | study, removed = fix_broken_study(study, study_name, storage, direction, sampler) 409 | 410 | if len(removed): 411 | logging.info( 412 | f"Removing problematic trials {removed}." 413 | ) 414 | else: 415 | logging.info("All trials check out!") 416 | 417 | 418 | # Override to create the database but skip submitting jobs. 419 | create_db_only = True if args_dict["create_study"] else False 420 | 421 | # Stop here if arg is defined -- intention is that you manually run run.py for debugging purposes 422 | if create_db_only: 423 | logging.info(f"Created study {study_name} located at {storage}. 
Exiting.") 424 | sys.exit() 425 | 426 | ############### 427 | # 428 | # SLURM SUPPORT 429 | # 430 | ############### 431 | 432 | # Prepare launch script 433 | if "slurm" in hyper_config: 434 | launch_script = prepare_slurm_launch_script(hyper_config, model_config) 435 | 436 | # Save the configured script 437 | script_path = hyper_config["optuna"]["save_path"] 438 | script_location = os.path.join(script_path, "launch_slurm.sh") 439 | with open(script_location, "w") as fid: 440 | for line in launch_script: 441 | fid.write(f"{line}\n") 442 | 443 | # Launch the slurm jobs 444 | job_ids = [] 445 | name_condition = "J" in hyper_config["slurm"]["batch"] 446 | slurm_job_name = hyper_config["slurm"]["batch"]["J"] if name_condition else "echo_trial" 447 | n_workers = hyper_config["slurm"]["jobs"] 448 | for worker in range(n_workers): 449 | w = subprocess.Popen( 450 | f"sbatch -J {slurm_job_name}_{worker} {script_location}", 451 | shell=True, 452 | stdout = subprocess.PIPE, 453 | stderr = subprocess.PIPE 454 | ).communicate() 455 | job_ids.append( 456 | w[0].decode("utf-8").strip("\n").split(" ")[-1] 457 | ) 458 | logging.info( 459 | f"Submitted slurm batch job {worker + 1}/{n_workers} with id {job_ids[-1]}" 460 | ) 461 | 462 | # Write the job ids to file for reference 463 | with open(os.path.join(script_path, "slurm_job_ids.txt"), "w") as fid: 464 | for line in job_ids: 465 | fid.write(f"{line}\n") 466 | 467 | ############### 468 | # 469 | # PBS SUPPORT 470 | # 471 | ############### 472 | 473 | if "pbs" in hyper_config: 474 | launch_script = prepare_pbs_launch_script(hyper_config, model_config) 475 | 476 | # Save the configured script 477 | script_path = hyper_config["optuna"]["save_path"] 478 | script_location = os.path.join(script_path, "launch_pbs.sh") 479 | with open(script_location, "w") as fid: 480 | for line in launch_script: 481 | fid.write(f"{line}\n") 482 | 483 | # Launch the slurm jobs 484 | job_ids = [] 485 | name_condition = "J" in hyper_config["pbs"]["batch"] 486 | slurm_job_name = hyper_config["pbs"]["batch"]["N"] if name_condition else "echo_trial" 487 | n_workers = hyper_config["pbs"]["jobs"] 488 | for worker in range(n_workers): 489 | w = subprocess.Popen( 490 | f"qsub -N {slurm_job_name}_{worker} {script_location}", 491 | shell=True, 492 | stdout = subprocess.PIPE, 493 | stderr = subprocess.PIPE 494 | ).communicate() 495 | job_ids.append( 496 | w[0].decode("utf-8").strip("\n") 497 | ) 498 | logging.info( 499 | f"Submitted pbs batch job {worker + 1}/{n_workers} with id {job_ids[-1]}" 500 | ) 501 | 502 | # Write the job ids to file for reference 503 | with open(os.path.join(script_path, "pbs_job_ids.txt"), "w") as fid: 504 | for line in job_ids: 505 | fid.write(f"{line}\n") 506 | -------------------------------------------------------------------------------- /blog/site/generators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (10/27/20) Methods, iterables, and generators for reading files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We all need to open, close, and save data to files using Python. Loading a file always involves reading the lines in the file, and possibly doing some processing on each line. An example file $\\textbf{test_data.txt}$ contains 1,000,000 rows (lines) and 3 columns separated by an empty space." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import time" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "fn = \"generators/test_data.txt\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "##### The Python Method" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def method_loader(fn):\n", 49 | " with open(fn) as fid:\n", 50 | " lines = fid.readlines() # Read in lines until reaching EOF\n", 51 | " lines = [line.strip(\"\\n\").split(\" \")[-1] for line in lines] # Process each line\n", 52 | " return lines" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "for processed_line in method_loader(fn):\n", 62 | " break" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "The entire file was read into memory with the .readlines() call, before any line processing was performed." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Reading files like this shouldn't be a problem for file sizes up to ~1G. But sometimes we have no choice and have to work with large files (sometimes hundreds of gigs). As a result the readlines operation can take a very long time. Furthermore, if the file is too large to load into memory, python will throw the error __MemoryError__ and your program will terminate *with error*." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "We frequently encounter large datafiles at NCAR. What can we do about it? \n", 84 | "\n", 85 | "Reading and processing one line at a time would solve this problem. We could even process an \"infinitely\" large file, which means any file that's too large to load fully into memory.\n", 86 | "\n", 87 | "This kind of file reading is called __lazy__ reading." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "##### The Python Generator" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "There is a special tool in the python toolbox that easily enables lazy reading called a generator. The generator object is built on top of python's Iterator object class, but I will cover them in reverse below." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 14, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def generator_loader(fn):\n", 111 | " with open(fn, \"r\") as fid:\n", 112 | " for line in fid:\n", 113 | " yield line.strip(\"\\n\").split(\" \")[-1]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "We've replaced the return with something new named $\\color{green}{\\textbf{yield}}$. This chunk of code looks similar to the method_loader!\n", 121 | "\n", 122 | "Test it." 
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 53,
128 |    "metadata": {},
129 |    "outputs": [
130 |     {
131 |      "name": "stdout",
132 |      "output_type": "stream",
133 |      "text": [
134 |       "0\n"
135 |      ]
136 |     }
137 |    ],
138 |    "source": [
139 |     "for processed_line in generator_loader(fn):\n",
140 |     "    print(processed_line)\n",
141 |     "    break # Stop early, I don't need to print 1,000,000 lines!"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "It behaves similarly in use to the method variant presented above. The big difference is that the generator version is more memory efficient because, through $\\color{green}{\\textbf{yield}}$, lines are read into memory one at a time, returned, released, ..., until reaching the end of the file (EOF). \n",
149 |     "\n",
150 |     "That is to say, $\\color{green}{\\textbf{yield}}$ returns more than once, whereas $\\color{green}{\\textbf{return}}$ in a method signals the end (in terms of memory usage, as it is freed and the method is exited). "
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Generators work nicely with serialized data ... you may have __*dumped*__ data using the pickle library before. The pickle library allows you to do that for the entire file, all in one go, or line-by-line as the example below illustrates:"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 16,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "import pickle"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 18,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "fn_pkl = \"generators/test_data.pkl\""
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 19,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "def write_to_pickle(data, fn):\n",
185 |     "    with open(fn, \"wb\") as fid:\n",
186 |     "        for line in data:\n",
187 |     "            pickle.dump(line, fid) # Iteration over .dump"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 21,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "write_to_pickle(\n",
197 |     "    method_loader(fn),\n",
198 |     "    fn_pkl\n",
199 |     ")"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "From here on, we will assume that we do not know how many lines are in our serialized data dump:"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 22,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "def load_from_pickle(fn):\n",
216 |     "    with open(fn, \"rb\") as fid:\n",
217 |     "        while True: # Keep looping with while.\n",
218 |     "            yield pickle.load(fid) # Iteration over .load"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "where the $\\color{green}{\\textbf{while True}}$ clause will keep looping over the call to load pickled data until we reach the end of the file. \n",
226 |     "\n",
227 |     "Test it!"
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 23, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "ename": "EOFError", 237 | "evalue": "Ran out of input", 238 | "output_type": "error", 239 | "traceback": [ 240 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 241 | "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", 242 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mload_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn_pkl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 243 | "\u001b[0;32m\u001b[0m in \u001b[0;36mload_from_pickle\u001b[0;34m(fn)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfid\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Keep looping with while.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32myield\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Iteration over .load\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 244 | "\u001b[0;31mEOFError\u001b[0m: Ran out of input" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "for row in load_from_pickle(fn_pkl):\n", 250 | " continue" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "It failed! \n", 258 | "\n", 259 | "We still need a signal that can be used to stop the generator from yielding the next line when it does not exist. \n", 260 | "\n", 261 | "Note that python threw an end-of-file error, $\color{red}{\textbf{EOFError}}$. We can catch that and use it to exit the generator:" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 24, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "def load_from_pickle(fn):\n", 271 | " with open(fn, \"rb\") as fid:\n", 272 | " try:\n", 273 | " while True: # We do not necessarily know how many lines are in fn\n", 274 | " yield pickle.load(fid) \n", 275 | " except EOFError:\n", 276 | " pass # Do nothing and leave load_from_pickle without error " 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Now it will run without error:" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 25, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "for row in load_from_pickle(fn_pkl):\n", 293 | " continue\n", 294 | " \n", 295 | "# Finishes without error" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "This is rather clunky! Now we have to do *exception handling* (gasp). You might be wondering what good python generators are at simplifying memory usage when this style of coding makes the workflow more complex. We could have relied on python's Iterator objects (covered next) to do the lazy reading." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Fortunately, there is a generalized version of yield, $\color{green}{\textbf{yield from}}$, which hands iteration off to another iterable. One caveat about the version below: pickle.load returns a single object per call, so this generator unpickles only the *first* record and then yields its contents. It avoids the $\color{red}{\textbf{EOFError}}$ simply because it never reads past that first object; for a multi-record file, the try/except version above remains the robust pattern:" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 26, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "def load_from_pickle(fn):\n", 319 | " with open(fn, \"rb\") as fid:\n", 320 | " yield from pickle.load(fid)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 27, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "for row in load_from_pickle(fn_pkl):\n", 330 | " continue" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 28, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# Finishes without error. " 340 | ] 341 | },
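{ "cell_type": "markdown", "metadata": {}, "source": [ "What $\color{green}{\textbf{yield from}}$ is really for is delegating to another iterable or generator; it is (roughly) shorthand for a for loop that yields every item. A minimal sketch (the helper name load_many is made up for illustration; generator_loader is the lazy line reader from above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_many(*filenames):\n", "    # \"yield from iterable\" is roughly: for item in iterable: yield item\n", "    for name in filenames:\n", "        yield from generator_loader(name)  # Hand iteration off to the inner generator" ] },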
342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "In short, generators let us write simple code that keeps memory usage manageable when working with large data files." 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "##### The Python Iterator" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "Before generators were introduced, one relied on a python __Iterator__ object to produce lazy readers. Iterator classes are not too difficult to write, but they come with requirements: in particular, they must implement the $\color{blue}{\textbf{__iter__}}$ and $\color{blue}{\textbf{__next__}}$ \"dunder\" (double-underscore) methods. \n", 361 | "\n", 362 | "A simple example with our serialized (pickled) data from above:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 44, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "class read_from_pickle_iterable:\n", 372 | " \n", 373 | " def __init__(self, fn):\n", 374 | " self.fn = fn\n", 375 | " self.fid = open(self.fn, \"rb\")\n", 376 | " \n", 377 | " def __iter__(self):\n", 378 | " return self\n", 379 | " \n", 380 | " def __next__(self):\n", 381 | " try:\n", 382 | " return pickle.load(self.fid)\n", 383 | " except EOFError:\n", 384 | " raise StopIteration" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "The dunder method $\color{blue}{\textbf{__iter__}}$ returns the object itself (through self!), while $\color{blue}{\textbf{__next__}}$ returns the result of the .load call on the opened file." 392 | ] 393 | },
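{ "cell_type": "markdown", "metadata": {}, "source": [ "As a mental model, this is roughly what a for loop does with any iterator under the hood (a comment-only sketch, not the full CPython machinery):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What \"for line in obj:\" expands to, approximately:\n", "#\n", "#     iterator = iter(obj)           # calls obj.__iter__()\n", "#     while True:\n", "#         try:\n", "#             line = next(iterator)  # calls iterator.__next__()\n", "#         except StopIteration:\n", "#             break                  # the loop ends quietly\n", "#         ...                        # loop body runs with line" ] },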
394 | { 395 | "cell_type": "code", 396 | "execution_count": 49, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "rfpi = read_from_pickle_iterable(fn_pkl)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "Using the Iterator's $\color{blue}{\textbf{__next__}}$ functionality, we can then grab the lines from the file one-by-one without reading the entire file into memory:" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 51, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "ename": "StopIteration", 417 | "evalue": "", 418 | "output_type": "error", 419 | "traceback": [ 420 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 421 | "\u001b[0;31mEOFError\u001b[0m Traceback (most recent call last)", 422 | "\u001b[0;32m\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 423 | "\u001b[0;31mEOFError\u001b[0m: Ran out of input", 424 | "\nDuring handling of the above exception, another exception occurred:\n", 425 | "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", 426 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrfpi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Use next like this\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 427 | "\u001b[0;32m\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mEOFError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 428 | "\u001b[0;31mStopIteration\u001b[0m: " 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "while True:\n", 434 | " next(rfpi) # Use next like this" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "which dies as intended: the $\color{red}{\textbf{EOFError}}$ is caught and a $\color{red}{\textbf{StopIteration}}$ is thrown using the $\color{green}{\textbf{raise}}$ clause. When the iterator is consumed the usual way, for example by a for loop (or by list(), enumerate(), etc.), the StopIteration is handled for us and the loop exits without error:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 52, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "for line in read_from_pickle_iterable(fn_pkl):\n", 451 | " continue" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Why even have generators when there are already iterators?" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "The answer is that generators are more compact and easier to write: you do not have to implement the $\color{blue}{\textbf{__iter__}}$ and $\color{blue}{\textbf{__next__}}$ methods yourself, as that is taken care of under the hood by the generator machinery. The converse is not true: plain iterator classes do not get the yield capability for free." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "### When should I use a generator rather than a method?" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "There are lots of scenarios, in addition to data loading! Note that the first method example above returned a list. Do you need all elements of that list at the same time? If the answer is no, and the file is large, then you want to try a generator!\n", 480 | "\n", 481 | "With generators, one trades the time spent performing operations on the data against memory utilization. Whether to use them is often determined by how your program is designed to run and utilize resources. If you face a significant memory bottleneck, generators are very often the way to go. However, if your program has no such memory issue, using a generator might make it run significantly slower. \n", 482 | "\n", 483 | "As you use generators more and more in your workflow, you will learn to apply them where they are needed and to avoid them where they offer no benefit over ordinary methods." 484 | ] 485 | },
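{ "cell_type": "markdown", "metadata": {}, "source": [ "To make the memory trade-off concrete, here is a small self-contained comparison using a *generator expression*, the inline cousin of a generator function. Exact byte counts will vary with your python version and platform:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "squares_list = [n * n for n in range(1_000_000)]  # Every element is materialized right now\n", "squares_gen = (n * n for n in range(1_000_000))  # Nothing has been computed yet\n", "\n", "print(sys.getsizeof(squares_list))  # Megabytes: the list holds a million references\n", "print(sys.getsizeof(squares_gen))  # ~100 bytes: just the generator's running state" ] },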
486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Feel free to email me (John Schreck, schreck@ucar.edu) with any questions, corrections, or suggestions!" 491 | ] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.8.6" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 4 515 | } 516 | -------------------------------------------------------------------------------- /blog/site/slurm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# (12/22/20) Helpful SLURM Commands\n", 8 | "David John Gagne\n", 9 | "\n", 10 | "SLURM is currently the scheduler on the Casper cluster, which means it manages how jobs are queued and dispatched. You are likely very familiar with sbatch and squeue at this point. 
SLURM also has a whole suite of other commands that give you an incredibly detailed view into the usage of the cluster by yourself and everyone else. This blog will provide some insights to help you better manage your own jobs and keep track of how busy Casper is." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Track your job memory and CPU usage with sacct\n", 18 | "`sacct` queries the SLURM scheduler database to find out how well you or any other user has utilized their requested resources on a job by job basis. The default output of sacct is not very useful, but with a few alterations to the command, you can get a wealth of information.\n", 19 | "\n", 20 | "I recommend running sacct in the following format (Note that ! allows you to run a command line program within a notebook. Do not copy the ! if you want to use the command in the terminal window):" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "execution": { 28 | "iopub.execute_input": "2020-12-22T19:47:31.923723Z", 29 | "iopub.status.busy": "2020-12-22T19:47:31.923313Z", 30 | "iopub.status.idle": "2020-12-22T19:47:32.129176Z", 31 | "shell.execute_reply": "2020-12-22T19:47:32.128743Z" 32 | } 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | " User JobName JobID AllocNodes ReqCPUS Elapsed CPUTime TotalCPU ReqMem MaxRSS ExitCode State \r\n", 40 | "--------- ---------- ------------ ---------- -------- ---------- ---------- ---------- ---------- ---------- -------- ---------- \r\n", 41 | " dgagne sfc 6249373 1 16 00:00:06 00:01:36 00:00.683 128Gn 1:0 FAILED \r\n", 42 | " batch 6249373.bat+ 1 16 00:00:06 00:01:36 00:00.682 128Gn 0 1:0 FAILED \r\n", 43 | " extern 6249373.ext+ 1 16 00:00:06 00:01:36 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 44 | " dgagne sfc 6249377 1 16 00:00:39 00:10:24 00:12.017 128Gn 1:0 FAILED \r\n", 45 | " batch 6249377.bat+ 1 16 00:00:39 00:10:24 00:12.016 128Gn 0.35G 1:0 FAILED \r\n", 46 | " extern 6249377.ext+ 1 16 00:00:39 00:10:24 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 47 | " dgagne sfc 6249380 1 16 00:00:23 00:06:08 00:11.773 128Gn 1:0 FAILED \r\n", 48 | " batch 6249380.bat+ 1 16 00:00:23 00:06:08 00:11.772 128Gn 0 1:0 FAILED \r\n", 49 | " extern 6249380.ext+ 1 16 00:00:23 00:06:08 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 50 | " dgagne sfc 6249390 1 16 00:08:07 02:09:52 21:33.130 128Gn 0:0 COMPLETED \r\n", 51 | " batch 6249390.bat+ 1 16 00:08:07 02:09:52 21:33.129 128Gn 1.40G 0:0 COMPLETED \r\n", 52 | " extern 6249390.ext+ 1 16 00:08:08 02:10:08 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 53 | " dgagne sfc 6250928 1 16 00:01:47 00:28:32 01:21.477 128Gn 1:0 FAILED \r\n", 54 | " batch 6250928.bat+ 1 16 00:01:47 00:28:32 01:21.476 128Gn 0.90G 1:0 FAILED \r\n", 55 | " extern 6250928.ext+ 1 16 00:01:47 00:28:32 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 56 | " dgagne sfc 6250961 1 16 00:01:21 00:21:36 01:19.700 128Gn 1:0 FAILED \r\n", 57 | " batch 6250961.bat+ 1 16 00:01:21 00:21:36 01:19.699 128Gn 0.90G 1:0 FAILED \r\n", 58 | " extern 6250961.ext+ 1 16 00:01:21 00:21:36 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 59 | " dgagne sfc 6251023 1 16 00:01:58 00:31:28 01:23.110 128Gn 1:0 FAILED \r\n", 60 | " batch 6251023.bat+ 1 16 00:01:58 00:31:28 01:23.109 128Gn 0.90G 1:0 FAILED \r\n", 61 | " extern 6251023.ext+ 1 16 00:01:58 00:31:28 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 62 | " dgagne sfc 6251026 1 16 00:04:27 01:11:12 09:42.497 128Gn 1:0 FAILED \r\n", 63 | " batch 6251026.bat+ 1 
16 00:04:27 01:11:12 09:42.496 128Gn 1.16G 1:0 FAILED \r\n", 64 | " extern 6251026.ext+ 1 16 00:04:27 01:11:12 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 65 | " dgagne sfc 6257007 1 16 00:05:45 01:32:00 13:28.988 128Gn 1:0 FAILED \r\n", 66 | " batch 6257007.bat+ 1 16 00:05:45 01:32:00 13:28.987 128Gn 1.24G 1:0 FAILED \r\n", 67 | " extern 6257007.ext+ 1 16 00:05:45 01:32:00 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 68 | " dgagne sfc 6257047 1 16 00:00:03 00:00:48 00:03.086 128Gn 1:0 FAILED \r\n", 69 | " batch 6257047.bat+ 1 16 00:00:03 00:00:48 00:03.085 128Gn 0 1:0 FAILED \r\n", 70 | " extern 6257047.ext+ 1 16 00:00:03 00:00:48 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 71 | " dgagne casp_nb 6266629 1 12 06:00:00 3-00:00:00 00:22.151 256Gn 0:0 TIMEOUT \r\n", 72 | " batch 6266629.bat+ 1 12 06:00:01 3-00:00:12 00:22.150 256Gn 0.30G 0:15 CANCELLED \r\n", 73 | " extern 6266629.ext+ 1 12 06:00:00 3-00:00:00 00:00.001 256Gn 0 0:0 COMPLETED \r\n", 74 | " dgagne sfc 6295916 1 16 00:03:06 00:49:36 05:27.670 128Gn 1:0 FAILED \r\n", 75 | " batch 6295916.bat+ 1 16 00:03:06 00:49:36 05:27.669 128Gn 1.06G 1:0 FAILED \r\n", 76 | " extern 6295916.ext+ 1 16 00:03:06 00:49:36 00:00:00 128Gn 0 0:0 COMPLETED \r\n", 77 | " dgagne sfc 6295929 1 16 00:11:13 02:59:28 28:03.125 128Gn 0:0 COMPLETED \r\n", 78 | " batch 6295929.bat+ 1 16 00:11:13 02:59:28 28:03.124 128Gn 1.38G 0:0 COMPLETED \r\n", 79 | " extern 6295929.ext+ 1 16 00:11:13 02:59:28 00:00.001 128Gn 0 0:0 COMPLETED \r\n", 80 | " dgagne htrainrt 6316207 1 30 00:10:11 05:05:30 48:06.423 200Gn 0:0 COMPLETED \r\n", 81 | " batch 6316207.bat+ 1 30 00:10:11 05:05:30 48:06.422 200Gn 56.67G 0:0 COMPLETED \r\n", 82 | " extern 6316207.ext+ 1 30 00:10:11 05:05:30 00:00.001 200Gn 0 0:0 COMPLETED \r\n", 83 | " dgagne htrainrt 6316247 1 30 00:44:20 22:10:00 02:20:53 200Gn 0:0 COMPLETED \r\n", 84 | " batch 6316247.bat+ 1 30 00:44:20 22:10:00 02:20:52 200Gn 104.03G 0:0 COMPLETED \r\n", 85 | " extern 6316247.ext+ 1 30 00:44:21 22:10:30 00:01.001 200Gn 0.00G 0:0 COMPLETED \r\n", 86 | " dgagne casp_nb 6319681 1 4 00:00:33 00:02:12 00:00.286 64Gn 0:0 COMPLETED \r\n", 87 | " batch 6319681.bat+ 1 4 00:00:33 00:02:12 00:00.285 64Gn 0.05G 0:0 COMPLETED \r\n", 88 | " extern 6319681.ext+ 1 4 00:00:33 00:02:12 00:00.001 64Gn 0 0:0 COMPLETED \r\n", 89 | " dgagne casp_nb 6319684 1 8 00:00:32 00:04:16 00:00.280 64Gn 0:0 COMPLETED \r\n", 90 | " batch 6319684.bat+ 1 8 00:00:32 00:04:16 00:00.280 64Gn 0.05G 0:0 COMPLETED \r\n", 91 | " extern 6319684.ext+ 1 8 00:00:32 00:04:16 00:00:00 64Gn 0 0:0 COMPLETED \r\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "! sacct --units=G --format=\"User,JobName,JobID,AllocNodes,ReqCPUs,Elapsed,CPUTime,TotalCPU,ReqMem,MaxRSS,ExitCode,State\" -S 2020-12-01 -E 2020-12-31 -u dgagne" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "The command breaks down into these parts:\n", 104 | "- `--units=G`: Print all memory-related outputs in Gigabytes. You can also use M or K for Megabytes and Kilobytes\n", 105 | "- `--format=\"...\"`: The list of columns to output. The full list can be found [here](https://slurm.schedmd.com/sacct.html). \n", 106 | "- `-S 2020-12-01`: The start date for the query. Can be adjusted so only recent jobs are visible.\n", 107 | "- `-E 2020-12-31`: The end date for the query. \n", 108 | "- `-u dgagne`: The username. Can be a comma separated list of users, like `-u dgagne,cbecker,schreck,ggantos`\n", 109 | "\n", 110 | "What does the output mean? 
The most relevant comparisons relate to CPU and memory usage. \n", 111 | "- Elapsed: total time the job runs in Day-Hour:Minute:Second format.\n", 112 | "- CPUTime: total time the CPUs are allocated, which should be close to Elapsed * ReqCPUs. \n", 113 | "- TotalCPU: The total amount of time the CPUs are in use by the user or the system. If this is far less than CPUTime, then you are requesting too many CPUs for your job. Note that TotalCPU does not account for child processes, so if you are running multiprocessing or dask, this number may be deceptively low. \n", 114 | "\n", 115 | "For memory usage\n", 116 | "- ReqMem: The total amount of memory the job requested.\n", 117 | "- MaxRSS: The maximum amount of memory the job used. If MaxRSS is far less than ReqMem, then decrease future memory requests. If it is the same or close to the same as ReqMem and your job is taking a longer than expected time to run, the program may be swapping memory to disk. You should ask for more memory in that case. " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## Track current cluster usage with sinfo\n", 125 | "`sinfo` prints out information about the current usage of every node in the cluster. It is helpful to see which nodes have what resources, and you can see how busy each node is. This may be especially useful when you are about to launch a multi-GPU or large memory job and want to make sure the memory and GPUs are available. The default `sinfo` call provides a very high level summary. Just like `sacct`, I recommend running the following command:\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 2, 131 | "metadata": { 132 | "execution": { 133 | "iopub.execute_input": "2020-12-22T19:47:32.132826Z", 134 | "iopub.status.busy": "2020-12-22T19:47:32.132459Z", 135 | "iopub.status.idle": "2020-12-22T19:47:32.281862Z", 136 | "shell.execute_reply": "2020-12-22T19:47:32.281505Z" 137 | } 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "HOSTNAMES AVAIL_FEATURES CPUS CPU_LOAD GRES GRES_USED ALLOCMEM FREE_MEM STATE AVAIL \r\n", 145 | "casper23 casper,skylake,mlx5_0,gp100,gpu,x11 72 0.16 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 0 342060 drained up \r\n", 146 | "casper20 casper,skylake,mlx5_0 72 0.56 (null) gpu:0,mps:0 0 295669 reserved up \r\n", 147 | "casper25 casper,skylake,mlx5_0,4xv100,v100,gpu 72 0.04 gpu:v100:4,mps:v100:400 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 734630 reserved up \r\n", 148 | "casper28 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.01 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 1123240 reserved up \r\n", 149 | "casper01 casper,skylake,mlx5_0 72 1.39 (null) gpu:0,mps:0 247808 217182 mixed up \r\n", 150 | "casper02 casper,skylake,mlx5_0 72 0.62 (null) gpu:0,mps:0 380044 335151 mixed up \r\n", 151 | "casper03 casper,skylake,mlx5_0 72 17.92 (null) gpu:0,mps:0 382534 325258 mixed up \r\n", 152 | "casper04 casper,skylake,mlx5_0 72 2.29 (null) gpu:0,mps:0 379904 305334 mixed up \r\n", 153 | "casper05 casper,skylake,mlx5_0 72 18.23 (null) gpu:0,mps:0 374784 314173 mixed up \r\n", 154 | "casper06 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.21 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 355328 337188 mixed up \r\n", 155 | "casper07 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.14 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 310854 341183 mixed up \r\n", 156 | "casper09 casper,skylake,mlx5_0,4xv100,v100,gpu 72 3.14 gpu:v100:4,mps:v100:400 
gpu:v100:4(IDX:0-3),mps:v100:0(IDX:N/A) 349452 649024 mixed up \r\n", 157 | "casper10 casper,skylake,mlx5_0 72 5.44 (null) gpu:0,mps:0 370968 332526 mixed up \r\n", 158 | "casper11 casper,skylake,mlx5_0 72 3.79 (null) gpu:0,mps:0 384140 336050 mixed up \r\n", 159 | "casper12 casper,skylake,mlx5_0 72 3.43 (null) gpu:0,mps:0 381952 325012 mixed up \r\n", 160 | "casper13 casper,skylake,mlx5_0 72 3.70 (null) gpu:0,mps:0 384582 333591 mixed up \r\n", 161 | "casper14 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.27 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 358982 342191 mixed up \r\n", 162 | "casper15 casper,skylake,mlx5_0,gp100,gpu,x11 72 2.29 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 307200 341551 mixed up \r\n", 163 | "casper16 casper,skylake,mlx5_0,gp100,gpu,x11 72 1.20 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 204800 344260 mixed up \r\n", 164 | "casper17 casper,skylake,mlx5_0,gp100,gpu,x11 72 1.28 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 364544 343480 mixed up \r\n", 165 | "casper18 casper,skylake,mlx5_0 72 17.04 (null) gpu:0,mps:0 382608 248991 mixed up \r\n", 166 | "casper19 casper,skylake,mlx5_0 72 31.76 (null) gpu:0,mps:0 352150 259655 mixed up \r\n", 167 | "casper22 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.19 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 310854 336178 mixed up \r\n", 168 | "casper24 casper,skylake,mlx5_0,8xv100,v100,gpu 72 10.09 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 256000 1053921 mixed up \r\n", 169 | "casper26 casper,skylake,mlx5_0,gp100,gpu,x11 72 3.68 gpu:gp100:1 gpu:gp100:0(IDX:N/A),mps:0 323142 296209 mixed up \r\n", 170 | "casper27 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.04 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 307200 1115535 mixed up \r\n", 171 | "casper29 casper,cascadelake,mlx5_0,4xv100,v100,gpu 72 0.84 gpu:v100:4,mps:v100:400 gpu:v100:3(IDX:0-2),mps:v100:0(IDX:N/A) 277062 728882 mixed up \r\n", 172 | "casper30 casper,cascadelake,mlx5_0,8xv100,v100,gpu 72 8.40 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 51200 1060850 mixed up \r\n", 173 | "casper31 casper,cascadelake,mlx5_0,8xv100,v100,gpu 72 8.32 gpu:v100:8,mps:v100:800 gpu:v100:8(IDX:0-7),mps:v100:0(IDX:N/A) 51200 1059902 mixed up \r\n", 174 | "casper36 casper,cascadelake,mlx5_0,4xv100,v100,gpu 72 52.14 gpu:v100:4,mps:v100:400 gpu:v100:2(IDX:0,2),mps:v100:0(IDX:N/A) 671744 641858 mixed up \r\n", 175 | "casper21 casper,skylake,mlx5_0 72 20.37 (null) gpu:0,mps:0 373532 161434 allocated up \r\n", 176 | "casper08 casper,skylake,mlx5_0,8xv100,v100,gpu 72 0.01 gpu:v100:8,mps:v100:800 gpu:v100:0(IDX:N/A),mps:v100:0(IDX:N/A) 0 1114594 idle up \r\n", 177 | "gladeslurm1 hsi 16 11.48 (null) gpu:0,mps:0 0 16338 idle up \r\n", 178 | "gladeslurm2 hsi 16 14.10 (null) gpu:0,mps:0 0 15644 idle up \r\n", 179 | "gladeslurm3 hsi 16 5.25 (null) gpu:0,mps:0 0 12911 idle up \r\n", 180 | "gladeslurm4 hsi 16 4.20 (null) gpu:0,mps:0 0 14423 idle up \r\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "! 
sinfo --Format=\"NodeHost:15,Features:50,CPUs:5,CPUsLoad:10,Gres:30,GresUsed:50,AllocMem:.15,FreeMem:.15 ,StateLong:15,Available:6\"" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "The columns provide the following information:\n", 193 | "- NodeHost: Prints the name of each node.\n", 194 | "- Features: Lists the CPU type (skylake or cascadelake), and the number and type of GPUs, if any\n", 195 | "- CPUs: Number of CPUs available, which is number of sockets * number of cores * threads per core (threads per core > 1 only with multithreading)\n", 196 | "- CPUsLoad: The node's recent load average, roughly how many CPU cores are busy\n", 197 | "- Gres: Number and type of GPUs\n", 198 | "- GresUsed: How many GPUs are currently allocated on the node\n", 199 | "- AllocMem: How much memory is allocated, in MB\n", 200 | "- FreeMem: How much memory is free, in MB\n", 201 | "- StateLong: Node usage, which can be idle, mixed, allocated, reserved, or drained\n", 202 | "- Available: up or down" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.8.6" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 4 234 | } 235 | --------------------------------------------------------------------------------