├── docs
├── image
│ ├── favicon.ico
│ └── logo_aiaccel.png
├── source
│ ├── _static
│ │ └── logo_aiaccel.png
│ ├── api_reference
│ │ ├── index.rst
│ │ ├── config.rst
│ │ ├── hpo.rst
│ │ └── torch.rst
│ ├── contribution_guide
│ │ ├── index.rst
│ │ ├── issues.md
│ │ ├── documentation.md
│ │ ├── tests.md
│ │ ├── coding_styles.md
│ │ └── pull_requests.md
│ ├── user_guide
│ │ ├── index.md
│ │ ├── torch.rst
│ │ └── config.rst
│ ├── index.rst
│ └── conf.py
├── Makefile
└── make.bat
├── MANIFEST.in
├── tests
├── config
│ ├── test_base.yaml
│ ├── test_resolve_path.yaml
│ ├── test_conf.yaml
│ ├── test_config_assets
│ │ └── print_config.txt
│ ├── apps
│ │ ├── test_check_git.yaml
│ │ └── test_check_git.py
│ └── test_config.py
├── torch
│ ├── datasets
│ │ ├── test_hdf5_dataset_assets
│ │ │ └── dataset.hdf5
│ │ ├── test_cached_dataset.py
│ │ ├── test_scatter_dataset.py
│ │ └── test_hdf5_dataset.py
│ └── lightning
│ │ └── test_abci_environment.py
├── job
│ └── apps
│ │ ├── config
│ │ └── custom_local.yaml
│ │ └── local.py
└── hpo
│ ├── apps
│ ├── data
│ │ ├── single_objective
│ │ │ ├── config.yaml
│ │ │ └── objective.py
│ │ └── multi_objective
│ │ │ ├── config.yaml
│ │ │ └── objective.py
│ └── test_optimize.py
│ └── optuna
│ ├── samplers
│ ├── results_ackley_step.csv
│ ├── results_ackley_int.csv
│ ├── results_ackley.csv
│ ├── results_ackley_logscale.csv
│ └── results_shpere_parallel.csv
│ └── test_hparams.py
├── examples
├── config
│ └── basic
│ │ ├── config.yaml
│ │ └── example.py
├── hpo
│ ├── benchmark
│ │ ├── result_bbob_dim_vs_value-fopt_parallel.png
│ │ ├── objective.sh
│ │ ├── main_parallel_coco.py
│ │ ├── README_ja.md
│ │ ├── job_config.yaml
│ │ ├── README.md
│ │ ├── plot.py
│ │ └── experiment_coco.py
│ ├── basic
│ │ ├── experiment
│ │ │ └── config.yaml
│ │ ├── objective.py
│ │ └── README.md
│ └── nelder_mead
│ │ ├── example.py
│ │ ├── README.md
│ │ ├── example_parallel.py
│ │ ├── example_sub_sampler.py
│ │ ├── example_enqueue.py
│ │ └── README_ja.md
└── torch
│ └── image_classification
│ ├── recipes
│ ├── resnet50.cifar10.ddp
│ │ └── config.yaml
│ └── resnet50.cifar10
│ │ └── config.yaml
│ ├── pyproject.toml
│ ├── src
│ └── image_classification
│ │ ├── small_resnet50.py
│ │ └── task.py
│ └── README.md
├── LICENSE_HEADER
├── .gitattributes
├── aiaccel
├── hpo
│ ├── __init__.py
│ ├── apps
│ │ ├── __init__.py
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ └── default.yaml
│ │ └── optimize.py
│ ├── optuna
│ │ ├── __init__.py
│ │ ├── samplers
│ │ │ └── __init__.py
│ │ ├── hparams.py
│ │ └── hparams_manager.py
│ └── algorithms
│ │ └── __init__.py
├── torch
│ ├── __init__.py
│ ├── apps
│ │ ├── __init__.py
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ ├── train_base.yaml
│ │ │ └── train_ddp.yaml
│ │ └── train.py
│ ├── h5py
│ │ ├── __init__.py
│ │ └── hdf5_writer.py
│ ├── lr_schedulers
│ │ ├── __init__.py
│ │ └── sequential_lr.py
│ ├── lightning
│ │ ├── datamodules
│ │ │ ├── __init__.py
│ │ │ └── single_datamodule.py
│ │ ├── callbacks
│ │ │ ├── __init__.py
│ │ │ ├── save_metric.py
│ │ │ ├── print_unused_param.py
│ │ │ └── load_pretrained.py
│ │ ├── __init__.py
│ │ ├── abci_environment.py
│ │ ├── ckpt.py
│ │ └── opt_lightning_module.py
│ ├── pipelines
│ │ └── __init__.py
│ ├── functional
│ │ ├── __init__.py
│ │ └── linear_sum_assignment.py
│ └── datasets
│ │ ├── __init__.py
│ │ ├── scatter_dataset.py
│ │ ├── file_cached_dataset.py
│ │ ├── hdf5_dataset.py
│ │ └── cached_dataset.py
├── config
│ ├── apps
│ │ ├── __init__.py
│ │ ├── get_value.py
│ │ └── check_git.py
│ ├── __init__.py
│ └── git.py
├── job
│ └── apps
│ │ ├── config
│ │ ├── __init__.py
│ │ ├── local.yaml
│ │ ├── slurm.yaml
│ │ ├── sge.yaml
│ │ └── pbs.yaml
│ │ ├── local.py
│ │ ├── __init__.py
│ │ ├── slurm.py
│ │ ├── sge.py
│ │ └── pbs.py
├── __init__.py
└── launcher.py
├── .readthedocs.yaml
├── mypy.ini
├── .github
├── ISSUE_TEMPLATE
│ ├── feature_request.md
│ └── bug_report.md
└── workflows
│ ├── lint.yaml
│ ├── ci.yaml
│ └── pypi-publish.yaml
├── LICENSE
├── typings
└── h5py.pyi
├── .pre-commit-config.yaml
├── README.md
├── .gitignore
└── pyproject.toml
/docs/image/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/image/favicon.ico
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include aiaccel/hpo/apps/config/*.yaml
include aiaccel/torch/apps/config/*.yaml
--------------------------------------------------------------------------------
/tests/config/test_base.yaml:
--------------------------------------------------------------------------------
A:
  - AAA: base
D:
  _inherit_: ${E}
E:
  EE: ee
--------------------------------------------------------------------------------
/docs/image/logo_aiaccel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/image/logo_aiaccel.png
--------------------------------------------------------------------------------
/examples/config/basic/config.yaml:
--------------------------------------------------------------------------------
model:
  _target_: torchvision.models.resnet50
  num_classes: 13
--------------------------------------------------------------------------------
/tests/config/test_resolve_path.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

--------------------------------------------------------------------------------
/docs/source/_static/logo_aiaccel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/source/_static/logo_aiaccel.png
--------------------------------------------------------------------------------
/LICENSE_HEADER:
--------------------------------------------------------------------------------
Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
SPDX-License-Identifier: MIT
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true
--------------------------------------------------------------------------------
/aiaccel/hpo/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/config/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/hpo/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/hpo/optuna/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/tests/torch/datasets/test_hdf5_dataset_assets/dataset.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/tests/torch/datasets/test_hdf5_dataset_assets/dataset.hdf5
--------------------------------------------------------------------------------
/aiaccel/hpo/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/job/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/examples/hpo/benchmark/result_bbob_dim_vs_value-fopt_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/examples/hpo/benchmark/result_bbob_dim_vs_value-fopt_parallel.png
--------------------------------------------------------------------------------
/docs/source/api_reference/index.rst:
--------------------------------------------------------------------------------
###############
 API Reference
###############

.. toctree::
    :maxdepth: 2

    config
    torch
    hpo
--------------------------------------------------------------------------------
/aiaccel/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from importlib.metadata import version

__version__ = version(__package__)
--------------------------------------------------------------------------------
/tests/config/test_conf.yaml:
--------------------------------------------------------------------------------
_base_: test_base.yaml

A:
  - _inherit_: ["${B}", "${C}"]
    AA: aa
  - AAA: aaa

B:
  AA: dummy
  BB: bb

C:
  CC: cc

Eval: ${eval:"(21 + 9) / (4 + (8 % 3) ** 4)"}
--------------------------------------------------------------------------------
/tests/job/apps/config/custom_local.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.job.apps.config}/local.yaml

script_prologue: |
  echo Hostname: $(hostname)

  export CUDA_VISIBLE_DEVICES=all

  echo ${config_path} | tee config_path.txt
--------------------------------------------------------------------------------
/aiaccel/torch/h5py/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.h5py.hdf5_writer import HDF5Writer

__all__ = [
    "HDF5Writer",
]
--------------------------------------------------------------------------------
/aiaccel/torch/lr_schedulers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lr_schedulers.sequential_lr import SequentialLR

__all__ = ["SequentialLR"]
--------------------------------------------------------------------------------
/examples/hpo/benchmark/objective.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#$-l rt_C.small=1
#$-cwd

source /etc/profile.d/modules.sh
module load gcc/13.2.0
module load python/3.10/3.10.14
source xxx/aiaccel_env/bin/activate

python3.10 experiment_coco.py $@
--------------------------------------------------------------------------------
/tests/config/test_config_assets/print_config.txt:
--------------------------------------------------------------------------------
================================================================================
foo:
  bar:
  - 1
  - 2
  - 3
================================================================================
--------------------------------------------------------------------------------
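The two fixtures above (test_base.yaml and test_conf.yaml) exercise aiaccel's config composition: `_base_` pulls in a parent file, `_inherit_` merges referenced nodes, and the `${eval:...}` resolver evaluates arithmetic (the expression above works out to 30 / 20 = 1.5). A minimal sketch of driving this machinery from Python follows; the function names come from aiaccel/config/__init__.py later in this dump, but the exact call pattern is an assumption:

```python
# Hedged sketch: load a config with _base_/_inherit_ composition and print it.
# load_config / resolve_inherit / print_config are exported by aiaccel.config;
# whether load_config resolves _base_ on its own is an assumption here.
from aiaccel.config import load_config, print_config, resolve_inherit

config = load_config("tests/config/test_conf.yaml")  # merges _base_: test_base.yaml
config = resolve_inherit(config)  # expands the _inherit_ references in A and D
print_config(config)  # prints a banner-framed dump like print_config.txt above
```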
/.readthedocs.yaml:
--------------------------------------------------------------------------------
version: "2"

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.10"

python:
  install:
    - method: pip
      path: .
      extra_requirements:
        - dev

sphinx:
  configuration: docs/source/conf.py
--------------------------------------------------------------------------------
/aiaccel/hpo/optuna/samplers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

__all__ = ["NelderMeadSampler"]
--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/train_base.yaml:
--------------------------------------------------------------------------------
trainer:
  _target_: lightning.Trainer
  default_root_dir: ${working_directory}

  logger:
    _target_: lightning.pytorch.loggers.TensorBoardLogger
    save_dir: ${working_directory}
    name: ''
    version: ''
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/datamodules/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.datamodules.single_datamodule import SingleDataModule

__all__ = ["SingleDataModule"]
--------------------------------------------------------------------------------
/aiaccel/torch/pipelines/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.pipelines.base_pipeline import BasePipeline, reorder_fields

__all__ = ["BasePipeline", "reorder_fields"]
--------------------------------------------------------------------------------
/aiaccel/torch/functional/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.functional.linear_sum_assignment import linear_sum_assignment

__all__ = [
    "linear_sum_assignment",
]
--------------------------------------------------------------------------------
/examples/torch/image_classification/recipes/resnet50.cifar10.ddp/config.yaml:
--------------------------------------------------------------------------------
_base_:
  - ../resnet50.cifar10/config.yaml

trainer:
  devices: "auto"
  sync_batchnorm: true

datamodule:
  batch_size: 256

task:
  optimizer_config:
    optimizer_generator:
      lr: 8.e-3
--------------------------------------------------------------------------------
/docs/source/contribution_guide/index.rst:
--------------------------------------------------------------------------------
####################
 Contribution Guide
####################

Thank you for contributing to aiaccel! This document introduces how to contribute.

.. toctree::
    :maxdepth: 2

    issues
    pull_requests
    documentation
    tests
    coding_styles
--------------------------------------------------------------------------------
/examples/torch/image_classification/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "image_classification"
version = "0.0.0"
requires-python = ">=3.10"
dependencies = [
    "torchvision",
    "torchmetrics",
    "aiaccel"
]
--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/train_ddp.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.torch.apps.config}/train_base.yaml

trainer:
  sync_batchnorm: true

  plugins:
    _target_: aiaccel.torch.lightning.abci_environment.ABCIEnvironment
  devices: ${oc.decode:${oc.env:OMPI_COMM_WORLD_LOCAL_SIZE}}
  num_nodes: ${oc.decode:${oc.env:OMPI_MCA_orte_num_nodes}}
--------------------------------------------------------------------------------
/tests/hpo/apps/data/single_objective/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

study:
  sampler:
    _target_: optuna.samplers.TPESampler
    seed: 0

params:
  x1: [0, 1]
  x2: [0, 1]

command: ["python", "${working_directory}/objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

n_trials: 15
n_max_jobs: 1
--------------------------------------------------------------------------------
/aiaccel/hpo/algorithms/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.hpo.algorithms.nelder_mead_algorithm import NelderMeadAlgorism, NelderMeadCoefficient, NelderMeadEmptyError

__all__ = [
    "NelderMeadCoefficient",
    "NelderMeadEmptyError",
    "NelderMeadAlgorism",
]
--------------------------------------------------------------------------------
/aiaccel/hpo/apps/config/default.yaml:
--------------------------------------------------------------------------------
db_filename: ${working_directory}/optuna.db

n_trials: 100
n_max_jobs: 10

study:
  _target_: optuna.create_study
  study_name: aiaccel-hpo
  storage:
    _target_: optuna.storages.RDBStorage
    url: sqlite:///${db_filename}

  load_if_exists: True

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
--------------------------------------------------------------------------------
/tests/hpo/apps/data/multi_objective/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

study:
  directions: ["minimize", "minimize"]
  sampler:
    _target_: optuna.samplers.NSGAIISampler
    seed: 0

params:
  x1: [0, 1]
  x2: [0, 1]

command: ["python", "${working_directory}/objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

n_trials: 15
n_max_jobs: 1

--------------------------------------------------------------------------------
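In both test configs above, `params` overrides the HparamsManager target from default.yaml with the shorthand `x1: [0, 1]`. A plausible reading, sketched below in plain Optuna, is that each two-element list becomes a float search range; this expansion rule is an assumption based on HparamsManager's role, not a confirmed API:

```python
# Hypothetical equivalent of `params: {x1: [0, 1], x2: [0, 1]}` in plain Optuna.
import optuna


def suggest_params(trial: optuna.trial.Trial) -> dict[str, float]:
    return {
        "x1": trial.suggest_float("x1", 0.0, 1.0),
        "x2": trial.suggest_float("x2", 0.0, 1.0),
    }
```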
/aiaccel/torch/lightning/callbacks/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.callbacks.load_pretrained import LoadPretrainedCallback
from aiaccel.torch.lightning.callbacks.print_unused_param import PrintUnusedParam
from aiaccel.torch.lightning.callbacks.save_metric import SaveMetricCallback

__all__ = ["SaveMetricCallback", "LoadPretrainedCallback", "PrintUnusedParam"]
--------------------------------------------------------------------------------
/examples/hpo/basic/experiment/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

command: ["./objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
  x1:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
  x2:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0

n_trials: 100
n_max_jobs: 50
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.abci_environment import ABCIEnvironment
from aiaccel.torch.lightning.ckpt import load_checkpoint
from aiaccel.torch.lightning.opt_lightning_module import OptimizerConfig, OptimizerLightningModule, build_param_groups

__all__ = ["ABCIEnvironment", "OptimizerConfig", "OptimizerLightningModule", "build_param_groups", "load_checkpoint"]
--------------------------------------------------------------------------------
/examples/config/basic/example.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from argparse import ArgumentParser

from hydra.utils import instantiate

from aiaccel.config import prepare_config, print_config

parser = ArgumentParser()
parser.add_argument("config", type=str, help="Config file in YAML format")
args, unk_args = parser.parse_known_args()

config = prepare_config(args.config)
print_config(config)

model = instantiate(config.model)

print(model)
--------------------------------------------------------------------------------
/aiaccel/torch/datasets/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.datasets.cached_dataset import CachedDataset
from aiaccel.torch.datasets.file_cached_dataset import FileCachedDataset
from aiaccel.torch.datasets.hdf5_dataset import HDF5Dataset, RawHDF5Dataset
from aiaccel.torch.datasets.scatter_dataset import scatter_dataset

__all__ = [
    "CachedDataset",
11 | "FileCachedDataset", 12 | "RawHDF5Dataset", 13 | "HDF5Dataset", 14 | "scatter_dataset", 15 | ] 16 | -------------------------------------------------------------------------------- /examples/torch/image_classification/src/image_classification/small_resnet50.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from torch import nn 5 | 6 | from torchvision import models 7 | 8 | 9 | class SmallResNet50(nn.Sequential): 10 | def __init__(self, num_classes: int): 11 | super().__init__() 12 | 13 | self.base = models.resnet50(num_classes=num_classes) 14 | self.base.conv1 = nn.Conv2d(3, 64, 3, 1, 1, bias=False) 15 | self.base.maxpool = nn.Identity() 16 | self.base.fc = nn.Linear(2048, 10) 17 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.10 3 | cache_dir = .mypy_cache 4 | mypy_path = typings/ 5 | 6 | plugins = numpy.typing.mypy_plugin 7 | 8 | allow_redefinition = True 9 | warn_unused_configs = True 10 | warn_redundant_casts = True 11 | show_error_codes = True 12 | show_column_numbers = True 13 | check_untyped_defs = True 14 | local_partial_types = True 15 | enable_error_code = possibly-undefined 16 | warn_unused_ignores = False 17 | 18 | strict_optional = True 19 | warn_no_return = True 20 | disallow_any_unimported = True 21 | strict = True 22 | implicit_reexport = False 23 | ignore_missing_imports = True 24 | exclude = build 25 | -------------------------------------------------------------------------------- /examples/hpo/basic/objective.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 4 | # SPDX-License-Identifier: MIT 5 | 6 | import argparse 7 | from pathlib import Path 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | 12 | parser.add_argument("out_filename", type=Path) 13 | parser.add_argument("--x1", type=float) 14 | parser.add_argument("--x2", type=float) 15 | 16 | args = parser.parse_args() 17 | 18 | y = (args.x1**2) + (args.x2**2) 19 | 20 | with open(args.out_filename, "w") as f: 21 | f.write(f"{y:f}") 22 | -------------------------------------------------------------------------------- /docs/source/api_reference/config.rst: -------------------------------------------------------------------------------- 1 | ##################### 2 | OmegaConf Utilities 3 | ##################### 4 | 5 | ****************** 6 | Config Utilities 7 | ****************** 8 | 9 | .. currentmodule:: aiaccel.config 10 | 11 | .. autosummary:: 12 | :toctree: generated/ 13 | 14 | setup_omegaconf 15 | prepare_config 16 | load_config 17 | print_config 18 | resolve_inherit 19 | pathlib2str_config 20 | 21 | *************** 22 | Git Utilities 23 | *************** 24 | 25 | .. currentmodule:: aiaccel.config 26 | 27 | .. 
/docs/source/api_reference/config.rst:
--------------------------------------------------------------------------------
#####################
 OmegaConf Utilities
#####################

******************
 Config Utilities
******************

.. currentmodule:: aiaccel.config

.. autosummary::
    :toctree: generated/

    setup_omegaconf
    prepare_config
    load_config
    print_config
    resolve_inherit
    pathlib2str_config

***************
 Git Utilities
***************

.. currentmodule:: aiaccel.config

.. autosummary::
    :toctree: generated/

    collect_git_status_from_config
    print_git_status
    PackageGitStatus
--------------------------------------------------------------------------------
/docs/source/user_guide/index.md:
--------------------------------------------------------------------------------
# User Guide

## Installation
You can install aiaccel directly from PyPI:
```bash
python -m pip install aiaccel
```

## Tutorials
When you want to try the tutorials, we recommend setting up the environment with `pixi`, which installs aiaccel together with every required dependency.
First, install `pixi` by following the instructions at <https://pixi.sh/latest/installation/>.

```bash
git clone https://github.com/aistairc/aiaccel.git
cd aiaccel
pixi install
pixi shell # enter the environment interactively
```

```{toctree}
:maxdepth: 1
config
torch
hpo
```
--------------------------------------------------------------------------------
/tests/hpo/apps/data/multi_objective/objective.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("out_filename", type=str)
    parser.add_argument("--x1", type=float)
    parser.add_argument("--x2", type=float)
    args = parser.parse_args()

    y1 = (args.x1 - 2) ** 2 + (args.x2 - 1) ** 2
    y2 = args.x1 + args.x2

    with open(args.out_filename, "w") as f:
        f.write(f"{[y1, y2]}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/tests/hpo/apps/data/single_objective/objective.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("out_filename", type=str)
    parser.add_argument("--x1", type=float)
    parser.add_argument("--x2", type=float)
    args = parser.parse_args()

    y = (args.x1**2) - (4.0 * args.x1) + (args.x2**2) - args.x2 - (args.x1 * args.x2)

    with open(args.out_filename, "w") as f:
        f.write(f"{y}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
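Note that the two test objectives above write different result formats: the single-objective script writes a bare float, while the multi-objective one writes a Python-style list `[y1, y2]`. A reader that handles both could look like this sketch; how aiaccel-hpo actually parses the file is not shown in this dump, so this is an illustrative assumption:

```python
# Parse an objective's output file into a list of values, for either format.
import ast
from pathlib import Path

text = Path("objective.out").read_text()
result = ast.literal_eval(text)  # float for single-objective, [y1, y2] for multi-objective
values = result if isinstance(result, list) else [result]
```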
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/aiaccel/config/apps/get_value.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse

from omegaconf import OmegaConf as oc  # noqa: N813

from aiaccel.config.config import prepare_config


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration file path")
    parser.add_argument("key", help="Target key in the configuration file")

    args, _ = parser.parse_known_args()
    config = prepare_config(args.config)

    print(oc.select(config, args.key))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/aiaccel/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.config.config import (
    load_config,
    pathlib2str_config,
    prepare_config,
    print_config,
    resolve_inherit,
    setup_omegaconf,
)
from aiaccel.config.git import PackageGitStatus, collect_git_status_from_config, print_git_status

__all__ = [
    "prepare_config",
    "load_config",
    "pathlib2str_config",
    "print_config",
    "resolve_inherit",
    "PackageGitStatus",
    "collect_git_status_from_config",
    "print_git_status",
    "setup_omegaconf",
]
--------------------------------------------------------------------------------
/docs/source/contribution_guide/issues.md:
--------------------------------------------------------------------------------
# Issues
When you find any problems or have requests for new features, please first check that no duplicate issue has already been posted.
We usually use Japanese for internal development, but we are more than happy to communicate with you in English.

## Bug report
A bug report should briefly summarize the following details:
* What the bug is
* Steps to reproduce the bug
* What you expected to happen
* The execution environment

## Feature request

A feature request should briefly summarize the following details:

- If a bug is relevant, what it is
- What new feature you want to achieve
- A description of the implementation you have in mind
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

.PHONY: livehtml html apidoc

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
html:
	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

livehtml:
	sphinx-autobuild -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS)
--------------------------------------------------------------------------------
/docs/source/api_reference/hpo.rst:
--------------------------------------------------------------------------------
#############################
 Hyperparameter Optimization
#############################

************
 Algorithms
************

.. currentmodule:: aiaccel.hpo.algorithms

.. autosummary::
    :toctree: generated/

    NelderMeadAlgorism

******************
 Optuna Utilities
******************

Samplers
========

.. currentmodule:: aiaccel.hpo.optuna.samplers

.. autosummary::
    :toctree: generated/

    NelderMeadSampler

Hparam
======

.. currentmodule:: aiaccel.hpo.optuna.hparams

.. autosummary::
    :toctree: generated/

    Hparam
    Const
    Float
    Int
    Categorical
--------------------------------------------------------------------------------
/aiaccel/config/apps/check_git.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse

from aiaccel.config.config import prepare_config
from aiaccel.config.git import collect_git_status_from_config, print_git_status


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration file path")

    args, _ = parser.parse_known_args()
    config = prepare_config(args.config)

    if len(git_status := collect_git_status_from_config(config)) > 0:
        print_git_status(git_status)

        exit(1)
    else:
        exit(0)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/tests/config/apps/test_check_git.yaml:
--------------------------------------------------------------------------------
storage:
  _target_: optuna.storages.RDBStorage
  url: sqlite:///aiaccel_storage.db
  engine_kwargs:
    connect_args:
      timeout: 30

study:
  _target_: optuna.create_study
  direction: minimize
  storage: ${storage}
  study_name: my_study
  load_if_exists: false
  sampler:
    _target_: optuna.samplers.TPESampler
    seed: 0

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
  x1:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
    log: false
  x2:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
    log: false

n_trials: 30
n_max_jobs: 1
--------------------------------------------------------------------------------
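check_git.py above turns a dirty working tree into a non-zero exit code, so it can gate experiment launches; test_check_git.yaml is the fixture it is exercised against. A rough sketch of the same check done programmatically follows, assuming collect_git_status_from_config inspects the packages referenced by the config's `_target_` entries (an assumption; only the function names are confirmed by aiaccel/config/__init__.py above):

```python
# Hedged sketch of the check performed by check_git.py.
from aiaccel.config import collect_git_status_from_config, prepare_config, print_git_status

config = prepare_config("tests/config/apps/test_check_git.yaml")
if status := collect_git_status_from_config(config):
    print_git_status(status)  # uncommitted changes found: report and bail out
```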
| "y": (-10.0, 10.0), 13 | } 14 | 15 | 16 | def sphere(trial: optuna.trial.Trial) -> float: 17 | params = [] 18 | for name, distribution in search_space.items(): 19 | params.append(trial.suggest_float(name, *distribution)) 20 | 21 | return float(np.sum(np.asarray(params) ** 2)) 22 | 23 | 24 | if __name__ == "__main__": 25 | study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42)) 26 | study.optimize(func=sphere, n_trials=100) 27 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/local.yaml: -------------------------------------------------------------------------------- 1 | walltime: null 2 | 3 | script_prologue: | 4 | echo Hostname: $(hostname) 5 | 6 | export CUDA_VISIBLE_DEVICES=all 7 | 8 | cpu: 9 | job: "{command}" 10 | 11 | cpu-array: 12 | n_tasks_per_proc: null 13 | n_procs: 24 14 | job: "{command}" 15 | 16 | gpu: 17 | job: "{command}" 18 | 19 | gpu-array: 20 | n_tasks_per_proc: null 21 | n_procs: 8 22 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % {args.n_procs} )) {command}" 23 | 24 | mpi: 25 | n_nodes: null 26 | job: | 27 | mpirun -np {args.n_procs} \\ 28 | {command} 29 | 30 | train: 31 | job: | 32 | mpirun -np {args.n_gpus} \\ 33 | -x MAIN_ADDR=$(hostname -i) \\ 34 | -x MAIN_PORT=3000 \\ 35 | -x COLUMNS=120 \\ 36 | -x PYTHONUNBUFFERED=true \\ 37 | {command} 38 | -------------------------------------------------------------------------------- /examples/hpo/nelder_mead/README.md: -------------------------------------------------------------------------------- 1 | # Examples of NelderMeadSampler 2 | 3 | ## 1. File Structure 4 | 5 | ### example.py 6 | ### example_parallel.py 7 | ### example_enqueue.py 8 | ### example_sub_sampler.py 9 | 10 | - This code demonstrates the general usage of NelderMeadSampler. 11 | - For more information, please refer to the following documents : docs/source/user_guide/hpo 12 | 13 | ### coco 14 | 15 | - This directory contains code for verifying NelderMeadSampler using the black-box optimization evaluation framework coco. 16 | - For details, please refer to the README.md in the relevant directory. 17 | 18 | ## 2. Instructions 19 | 20 | - After installing aiaccel and activating the virtual environment, run the corresponding file. 21 | 22 | ```bash 23 | python example.py 24 | ``` 25 | 26 | ## 3. Checking Results 27 | 28 | - The execution results of the example code are displayed in the standard output. 29 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_parallel.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import time

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    time.sleep(0.01)

    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42, block=True))
    study.optimize(func=sphere, n_trials=100, n_jobs=3)
--------------------------------------------------------------------------------
/tests/torch/datasets/test_cached_dataset.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from collections import defaultdict

from torch.utils.data import Dataset

from aiaccel.torch.datasets.cached_dataset import CachedDataset


def test_cached_dataset() -> None:
    class DummyDataset(Dataset[int]):
        def __init__(self) -> None:
            self.counter = defaultdict[int, int](lambda: 0)

        def __getitem__(self, index: int) -> int:
            self.counter[index] += 1
            return index

    orig_dataset = DummyDataset()
    dataset = CachedDataset(orig_dataset)

    for _ in range(2):
        for ii in range(5):
            assert dataset[ii] == ii

    assert all(count == 1 for count in orig_dataset.counter.values())
--------------------------------------------------------------------------------
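As the test above shows, CachedDataset memoizes `__getitem__` so each underlying index is materialized only once. A short usage sketch along the same lines (the wrapped dataset is a stand-in):

```python
# Wrap a dataset with expensive item construction in CachedDataset.
from torch.utils.data import Dataset

from aiaccel.torch.datasets import CachedDataset


class ExpensiveDataset(Dataset[int]):
    def __len__(self) -> int:
        return 5

    def __getitem__(self, index: int) -> int:
        # imagine costly decoding / preprocessing here
        return index * index


dataset = CachedDataset(ExpensiveDataset())
print([dataset[i] for i in range(5)])  # first pass fills the cache
print([dataset[i] for i in range(5)])  # second pass is served from the cache
```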
/aiaccel/torch/lightning/callbacks/save_metric.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import json

import lightning


class SaveMetricCallback(lightning.Callback):
    """
    Lightning callback that saves a metric value when fitting ends.

    Args:
        metric_name (str): Name of the metric to save.
        output_path (str): Path of the file to save the metric to.
    """

    def __init__(self, metric_name: str, output_path: str) -> None:
        super().__init__()
        self.metric_name = metric_name
        self.output_path = output_path

    def on_fit_end(self, trainer: lightning.Trainer, pl_module: lightning.LightningModule) -> None:
        metric_value = trainer.callback_metrics[self.metric_name].item()
        with open(self.output_path, "w") as f:
            json.dump(metric_value, f)
--------------------------------------------------------------------------------
/tests/hpo/optuna/samplers/results_ackley_step.csv:
--------------------------------------------------------------------------------
x,y,objective
-8,27.0,19.62720599480816
12,5.5,18.625976717821782
-20,-21.0,19.668950397165975
-8,-2.5,15.605357241663704
12,-23.5,21.23920263185265
-2,14.0,17.293294335267746
-26,6.0,19.54061203177721
4,6.0,12.786686824116272
-2,-11.0,15.88518677832371
-2,-4.5,11.750950682245877
10,3.5,17.248230608441084
-4,-1.0,8.836638915350669
-10,-11.0,17.556692462851238
0,1.5,5.5411239587646826
-2,5.0,10.661412934927588
-2,3.0,7.9889108105187
2,5.0,10.661412934927588
-2,0.5,6.776152740106655
0,-0.5,3.0836533599911533
0,-2.0,4.927233671124704
2,0.5,6.776152740106655
0,0.5,3.0836533599911533
-2,-1.5,7.674511801927853
0,1.0,2.637531092108304
0,-0.5,3.0836533599911533
0,0.5,3.0836533599911533
0,0.0,0.0
0,0.5,3.0836533599911533
0,0.0,0.0
0,-0.5,3.0836533599911533
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_sub_sampler.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import time

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    time.sleep(0.01)

    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(
        sampler=NelderMeadSampler(search_space=search_space, seed=42, sub_sampler=optuna.samplers.TPESampler(seed=42))
    )
    study.optimize(func=sphere, n_trials=100, n_jobs=3)
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_enqueue.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42))
    study.enqueue_trial({"x": 1.0, "y": 1.0})
    study.enqueue_trial({"x": 1.0, "y": 2.0})
    study.enqueue_trial({"x": 2.0, "y": 1.0})
    study.optimize(func=sphere, n_trials=100)
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/callbacks/print_unused_param.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import lightning as lt
from lightning.pytorch.utilities import rank_zero_warn


class PrintUnusedParam(lt.Callback):
    """Warn once when trainable parameters do not receive gradients."""

    def __init__(self) -> None:
        super().__init__()
        self._has_warned = False

    def on_after_backward(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None:  # type: ignore[override]
        """Emit a warning for parameters that never collected gradients."""
        if self._has_warned or not trainer.is_global_zero:
            return

        for name, param in pl_module.named_parameters():
            if param.requires_grad and param.grad is None:
                rank_zero_warn(f"{name} is unused")

        self._has_warned = True
--------------------------------------------------------------------------------
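PrintUnusedParam only reports once, and only on the global-zero rank, which keeps DDP logs readable. Attaching it is a one-liner; the rest of the Trainer setup below is an illustrative assumption:

```python
# Register the callback on a Lightning Trainer.
import lightning as lt

from aiaccel.torch.lightning.callbacks import PrintUnusedParam

trainer = lt.Trainer(max_epochs=1, callbacks=[PrintUnusedParam()])
# trainer.fit(model, datamodule=datamodule)  # any parameter left without a gradient is warned about once
```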
/tests/torch/datasets/test_scatter_dataset.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from torch.utils.data import Dataset

from pytest_mock.plugin import MockerFixture

from aiaccel.torch.datasets.scatter_dataset import scatter_dataset


def test_scatter_dataset(mocker: MockerFixture) -> None:
    class DummyDataset(Dataset[int]):
        def __len__(self) -> int:
            return 16

        def __getitem__(self, index: int) -> int:
            return index

    orig_dataset = DummyDataset()

    mocker.patch("torch.distributed.get_world_size", return_value=4)
    indices = []
    for rank in range(4):
        mocker.patch("torch.distributed.get_rank", return_value=rank)
        dataset = scatter_dataset(orig_dataset)

        assert len(dataset) == len(orig_dataset) // 4
        indices += list(dataset.indices)

    assert sorted(indices) == list(range(16))
--------------------------------------------------------------------------------
/examples/torch/image_classification/README.md:
--------------------------------------------------------------------------------
# Training a ResNet50 on CIFAR-10

## Setup
We assume the Python-environment setup at `examples/python`.
```bash
pushd ../../python/
bash setup_python.sh
. activate.sh
popd
```

```bash
pip install -e .
```

## Training on a single GPU
```bash
qsub -I -P [group_name] -q rt_HG -l select=1 -l walltime=1:0:0

cd $PBS_O_WORKDIR
. ../../python/activate.sh

cd recipes
aiaccel-torch train resnet50.cifar10/config.yaml
```

## Training on multiple GPUs
This script automatically uses all the GPUs on the node. The hyperparameters assume eight GPUs.
```bash
qsub -I -P [group_name] -q rt_HF -l select=1 -l walltime=1:0:0

cd $PBS_O_WORKDIR
. ../../python/activate.sh

cd recipes
aiaccel-torch train resnet50.cifar10.ddp/config.yaml
```

## Detailed Descriptions [TBD]
Detailed descriptions are available on the [aiaccel document](https://aistairc.github.io/aiaccel/user_guide/torch.html)
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
name: Lint

on:
  push:
    branches: ["main", "develop/*"]
  pull_request:

jobs:
  lint:
    name: Lint
    runs-on: ${{ matrix.os }}
    env:
      PIP_INDEX_URL: https://download.pytorch.org/whl/cpu
      PIP_EXTRA_INDEX_URL: https://pypi.org/simple
    strategy:
      matrix:
        os: ['ubuntu-22.04']
        python-version: ['3.10']
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: pyproject.toml
      - name: Install dependencies
        run: |
          pip install .[dev,github-actions]
      - name: Perform ruff
        run: |
          ruff check
          ruff format --check
      - name: Perform mypy
        run: |
          mypy --config-file mypy.ini . --explicit-package-bases
      - name: Perform docstrfmt
        run: |
          docstrfmt --check docs/source/
--------------------------------------------------------------------------------
/docs/source/contribution_guide/documentation.md:
--------------------------------------------------------------------------------
(documentation-wip)=
# Documentation (WIP)

## Docstrings

- Write a basic description of the implemented functions, the types and meanings of parameters and return values, and examples of their usage.
- Write in accordance with the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
- See also [Coding Conventions](coding_styles).

## Documentation

- Create source files for documentation in a directory under docs.
- The recommended file format for documents is Markdown.
- Create documentation for any major feature additions.

## Confirming rendering

If you have added, changed, or modified documents, make sure that they render correctly in the local environment.
Move to the aiaccel directory and execute the following command to generate an API reference.

~~~bash
cd aiaccel
sphinx-apidoc --maxdepth 2 -f -o ./docs/source/api_reference/ ./aiaccel/
~~~

Move to aiaccel/docs and build html files to see how the document is rendered.

~~~bash
cd docs
make html
~~~
--------------------------------------------------------------------------------
/aiaccel/launcher.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3

# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from argparse import ArgumentParser
import importlib
from pathlib import Path
import pkgutil
import sys


def main() -> None:
    target_module = Path(sys.argv[0]).stem.split("-")[-1]

    package = importlib.import_module(f"aiaccel.{target_module}.apps")

    modules = [name.replace("_", "-") for _, name, ispkg in pkgutil.iter_modules(package.__path__) if not ispkg]
    if not modules:
        raise RuntimeError(f"No apps found in aiaccel.{target_module}.apps")

    parser = ArgumentParser(description=f"Run aiaccel-{target_module} apps.", add_help=False)
    parser.add_argument("command", choices=modules, help="The command to run.")
    args, unk_args = parser.parse_known_args()

    module = importlib.import_module(f"aiaccel.{target_module}.apps.{args.command.replace('-', '_')}")

    sys.argv = [str(module.__file__)] + unk_args
    module.main()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
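launcher.py implements the dispatch behind console scripts such as `aiaccel-torch`: the suffix of argv[0] selects the package (`aiaccel.torch.apps`), the first CLI argument selects the app module, and the remaining arguments are forwarded. The snippet below is the programmatic equivalent of `aiaccel-torch train config.yaml`, following the same steps launcher.py performs (the config path is illustrative):

```python
# Manual replay of launcher.py's dispatch for `aiaccel-torch train ...`.
import importlib
import sys

module = importlib.import_module("aiaccel.torch.apps.train")  # "train" comes from the command argument
sys.argv = [str(module.__file__), "resnet50.cifar10/config.yaml"]  # forwarded arguments
module.main()
```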
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/typings/h5py.pyi:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | 
3 | from collections.abc import KeysView
4 | from pathlib import Path
5 | from types import TracebackType
6 | 
7 | # This stub is just for passing mypy.
8 | class Dataset:
9 |     def __setitem__(self, arg: Any, value: Any) -> None: ...
10 |     def __getitem__(self, key: Any) -> Any: ...
11 | 
12 | class Group:
13 |     def create_dataset(self, name: str, shape: Any | None = None, dtype: Any | None = None) -> Dataset: ...
14 |     def __getitem__(self, name: Any) -> Dataset: ...
15 |     def items(self) -> list[tuple[str, Any]]: ...
16 | 
17 | class File:
18 |     def __init__(
19 |         self,
20 |         name: str | Path,
21 |         mode: str = "r",
22 |         driver: str | None = None,
23 |         comm: Any | None = None,
24 |     ) -> None: ...
25 |     def __enter__(self) -> File: ...
26 |     def __exit__(
27 |         self,
28 |         ex_exc_type: type[BaseException] | None,
29 |         exc_value: BaseException | None,
30 |         traceback: TracebackType | None,
31 |     ) -> bool: ...
32 |     def keys(self) -> KeysView[str]: ...
33 |     def __getitem__(self, key: str) -> Group | Dataset: ...
34 |     def close(self) -> None: ...
35 |     def create_group(self, name: str) -> Group: ...
36 | 
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/README_ja.md:
--------------------------------------------------------------------------------
1 | # NelderMeadSampler examples
2 | 
3 | ## 1. File Structure
4 | 
5 | ### example.py
6 | 
7 | - Code showing the typical usage of NelderMeadSampler (see the sketch at the end of this section).
8 | - The optimization target is the benchmark function sphere. (The same applies to the examples below unless otherwise noted.)
9 | 
10 | ### example_parallel.py
11 | 
12 | - Code showing how to use NelderMeadSampler with parallel execution.
13 | - Parallel execution is enabled by passing block=True to NelderMeadSampler and n_jobs=3 to study.optimize.
14 | - Enabling parallel execution parallelizes the initial-point evaluations and the evaluations during shrink, which is faster than serial execution.
15 | 
16 | ### example_enqueue.py
17 | 
18 | - Code showing how to use NelderMeadSampler with optuna.study.enqueue_trial.
19 | - Using the ask-tell interface, random parameters are searched via enqueue_trial when NelderMeadSampler fails to produce parameters.
20 | 
21 | ### example_sub_sampler.py
22 | 
23 | - Code showing how to use the sub_sampler feature of NelderMeadSampler.
24 | - With sub_sampler=optuna.samplers.TPESampler passed to NelderMeadSampler, the TPESampler is used for the search when NelderMeadSampler fails to produce parameters.
25 | - When using the sub_sampler feature, the argument block=False is required even for parallel execution. (Parallel execution is still possible with block=False.)
26 | 
27 | ### coco
28 | 
29 | - A directory containing verification code for NelderMeadSampler based on coco, a framework for evaluating black-box optimization.
30 | - See the README.md in that directory for details.
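
The following is a minimal sketch of the usage illustrated by `example.py`. The import path follows this repository's package layout, but the exact `NelderMeadSampler` constructor arguments (e.g., `search_space`) are assumptions, not confirmed by this README:

```python
import optuna

# Assumed import path, based on the aiaccel.hpo.optuna.samplers package in this repository.
from aiaccel.hpo.optuna.samplers import NelderMeadSampler


def sphere(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -10.0, 10.0)
    y = trial.suggest_float("y", -10.0, 10.0)
    return x**2 + y**2


# search_space is assumed to map each parameter name to its (low, high) range.
sampler = NelderMeadSampler(search_space={"x": (-10.0, 10.0), "y": (-10.0, 10.0)}, seed=42)
study = optuna.create_study(sampler=sampler)
study.optimize(sphere, n_trials=30)
```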
31 | 
32 | ## 2. How to Run
33 | 
34 | - After installing aiaccel and activating the virtual environment, run the target file.
35 | 
36 | ```bash
37 | python example.py
38 | ```
39 | 
40 | ## 3. Checking the Results
41 | 
42 | - The results of running the example code are printed to standard output.
43 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_install_hook_types:
2 |   - pre-commit
3 |   - pre-push
4 | 
5 | repos:
6 |   - repo: https://github.com/astral-sh/ruff-pre-commit
7 |     rev: v0.14.8
8 |     hooks:
9 |       - id: ruff
10 |         args: [--fix]
11 |       - id: ruff-format
12 |   - repo: https://github.com/pre-commit/mirrors-mypy
13 |     rev: v1.19.0
14 |     hooks:
15 |       - id: mypy
16 |         language: system
17 |         args: [--config-file, mypy.ini, --explicit-package-bases]
18 |   - repo: https://github.com/LilSpazJoekp/docstrfmt
19 |     rev: v2.0.0
20 |     hooks:
21 |       - id: docstrfmt
22 |         args: [--check, docs/source/, --extend-exclude, docs/source/api_reference/generated]
23 |         types_or: [rst]
24 |   - repo: https://github.com/Lucas-C/pre-commit-hooks
25 |     rev: v1.5.5
26 |     hooks:
27 |       - id: insert-license
28 |         name: "Insert license header"
29 |         args:
30 |           - --license-filepath=LICENSE_HEADER
31 |           - --detect-license-in-X-top-lines=16
32 |         types_or: [python, makefile]
33 | 
34 |   - repo: local
35 |     hooks:
36 |       - id: pytest
37 |         name: pytest
38 |         entry: pytest -v -x -n auto --cov=aiaccel/ --cov-append tests/
39 |         stages: [pre-push]
40 |         language: system
41 |         pass_filenames: false
42 |         always_run: true
--------------------------------------------------------------------------------
/tests/torch/lightning/test_abci_environment.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import os
5 | from unittest import mock
6 | 
7 | import pytest
8 | 
9 | from aiaccel.torch.lightning.abci_environment import ABCIEnvironment
10 | 
11 | 
12 | @mock.patch.dict(
13 |     os.environ,
14 |     {
15 |         "OMPI_COMM_WORLD_SIZE": "8",
16 |         "OMPI_COMM_WORLD_RANK": "6",
17 |         "OMPI_COMM_WORLD_LOCAL_RANK": "2",
18 |         "OMPI_COMM_WORLD_LOCAL_SIZE": "4",
19 |         "MAIN_ADDR": "192.168.0.1",
20 |         "MAIN_PORT": "3000",
21 |     },
22 | )
23 | def test_abci_environment() -> None:
24 |     env = ABCIEnvironment()
25 | 
26 |     assert env.creates_processes_externally
27 | 
28 |     assert env.main_address == "192.168.0.1"
29 |     assert env.main_port == 3000
30 | 
31 |     assert env.detect()
32 |     assert env.world_size() == 8
33 |     assert env.global_rank() == 6
34 |     assert env.node_rank() == 1
35 |     assert env.local_rank() == 2
36 | 
37 |     env.validate_settings(4, 2)
38 | 
39 |     with pytest.raises(ValueError, match=r"^`num_devices` should match.*"):
40 |         env.validate_settings(3, 2)
41 | 
42 |     with pytest.raises(ValueError, match=r"^`num_devices \* num_nodes` should match.*"):
43 |         env.validate_settings(4, 1)
44 | 
--------------------------------------------------------------------------------
/aiaccel/job/apps/config/slurm.yaml:
--------------------------------------------------------------------------------
1 | walltime: "1:0:0"
2 | 
3 | script_prologue: |
4 |   echo Job ID: $SLURM_JOBID
5 |   echo Hostname: $(hostname)
6 | 
7 | sbatch: "sbatch --export=USE_SSH=1 --export=ALL"
8 | 
9 | cpu:
10 |   sbatch_args: "-p cpu1 -N 1"
11 |   job: "{command}"
12 | 
13 | cpu-array:
14 |   n_tasks_per_proc: 64
15 |   n_procs: 4
16 |   sbatch_args: "-p cpu1 -N 1 --array=1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
17 |   job: 
"{command}" 18 | 19 | gpu: 20 | sbatch_args: "-p gpu1 -N 1" 21 | job: "{command}" 22 | 23 | gpu-array: 24 | n_tasks_per_proc: 64 25 | n_procs: 4 26 | sbatch_args: "-p gpu1 -N 1 --array=1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 27 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 28 | 29 | mpi: 30 | n_nodes: 1 31 | sbatch_args: >- 32 | -p gpu1 -N {args.n_nodes} -n {args.n_procs} 33 | job: srun -n {args.n_procs} --cpu-bind=none --distribution=block:block {command} 34 | 35 | train: 36 | sbatch_args: >- 37 | -p gpu1 -N {args.n_gpus} 38 | job: | 39 | export MAIN_ADDR=$(hostname -i) 40 | export MAIN_PORT=3000 41 | export COLUMNS=120 42 | export PYTHONUNBUFFERED=true 43 | srun -n {args.n_gpus} --cpu-bind=none --distribution=block:block {command} 44 | 45 | use_scandir: False 46 | -------------------------------------------------------------------------------- /tests/config/apps/test_check_git.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | 6 | from pytest_mock import MockerFixture 7 | 8 | from aiaccel.config.apps import check_git 9 | from aiaccel.config.git import PackageGitStatus 10 | 11 | 12 | def test_check_git(mocker: MockerFixture) -> None: 13 | mock_args = mocker.Mock() 14 | mock_args.config = str(Path(__file__).parent / "test_check_git.yaml") 15 | 16 | mock_argparse = mocker.patch("argparse.ArgumentParser.parse_known_args") 17 | mock_argparse.return_value = (mock_args, []) 18 | 19 | # Success 20 | mock_func = mocker.patch("aiaccel.config.apps.check_git.collect_git_status_from_config") 21 | mock_func.return_value = [] 22 | 23 | try: 24 | check_git.main() 25 | except SystemExit as e: 26 | if e.code != 0: 27 | raise AssertionError() from e 28 | 29 | # Failed 30 | mock_func = mocker.patch("aiaccel.config.apps.check_git.collect_git_status_from_config") 31 | mock_func.return_value = [PackageGitStatus("test_package", "test_id", [])] 32 | 33 | try: 34 | check_git.main() 35 | except SystemExit as e: 36 | if e.code != 1: 37 | raise AssertionError() from e 38 | else: 39 | raise AssertionError() 40 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley_int.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | -2,9.014286128198322,14.586422044008504 3 | 4,1.973169683940732,9.375537945818003 4 | -6,-6.880109593275947,14.844107008867795 5 | -2,-0.6931908436032099,6.507841622369519 6 | 4,-7.7343072878607995,15.311178036343385 7 | 0,4.827137774183541,10.531632891221614 8 | 2,-3.5471589338460197,10.452142477225951 9 | 1,-1.4535847568386302,6.11338663731501 10 | -5,-4.119945284382571,12.343741882384256 11 | 2,0.4498909418599073,6.726787205494825 12 | -3,-2.5966665423017465,10.213321495706827 13 | 0,-0.31174842918050594,2.2163113158354655 14 | 5,-1.0721423424159262,10.429712991161596 15 | 0,-0.7879287183063894,2.97192765167404 16 | -1,0.3539076093517366,4.287560101074227 17 | 0,-0.09796548219585688,0.5135246948216725 18 | 0,0.37821480693002485,2.6103706703067395 19 | 0,0.08667892562092305,0.43335945922963504 20 | -1,0.3004618726055739,4.053178155393216 21 | 0,-0.15869585373398643,0.999428637748025 22 | 0,0.14740929715905438,0.9043866409467292 23 | 0,0.07088300943579284,0.328887506768492 24 | 0,0.2555274172525728,1.8077753439839057 25 | 
0,-0.009592257333748577,0.029579255457047537
26 | -1,-0.025388173518878787,2.655523461532038
27 | 0,0.05866215083597304,0.25500217190066365
28 | 0,-0.021813115933571936,0.07431720082603732
29 | 0,-0.09006752410329355,0.4569753311471949
30 | 0,0.021479732101155946,0.07299288494041889
31 | 0,0.03370059070097575,0.12528010757207042
32 | 
--------------------------------------------------------------------------------
/aiaccel/torch/lr_schedulers/sequential_lr.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from collections.abc import Callable
5 | 
6 | import torch
7 | 
8 | 
9 | class SequentialLR(torch.optim.lr_scheduler.SequentialLR):
10 |     """
11 |     A wrapper of torch.optim.lr_scheduler.SequentialLR to use a list of functions
12 |     to create schedulers.
13 | 
14 |     Args:
15 |         optimizer: Optimizer.
16 |         schedulers_fn: List of functions to create schedulers.
17 |         milestones: List of epoch indices. Must be increasing.
18 | 
19 |     .. code-block:: yaml
20 | 
21 |         scheduler_generator:
22 |           _partial_: True
23 |           _convert_: "all"
24 |           _target_: aiaccel.torch.lr_schedulers.SequentialLR
25 |           schedulers_fn:
26 |             - _target_: torch.optim.lr_scheduler.LinearLR
27 |               _partial_: True
28 |               start_factor: 1.e-3
29 |               end_factor: 1.0
30 |               total_iters: 5000
31 |             - _target_: torch.optim.lr_scheduler.CosineAnnealingLR
32 |               _partial_: True
33 |               T_max: 95000
34 |           milestones: [5000]
35 |     """
36 | 
37 |     def __init__(
38 |         self,
39 |         optimizer: torch.optim.Optimizer,
40 |         schedulers_fn: list[Callable[[torch.optim.Optimizer], torch.optim.lr_scheduler._LRScheduler]],
41 |         milestones: list[int],
42 |     ):
43 |         super().__init__(optimizer, [fn(optimizer) for fn in schedulers_fn], milestones)
44 | 
--------------------------------------------------------------------------------
/examples/hpo/benchmark/main_parallel_coco.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from concurrent.futures import ThreadPoolExecutor
5 | from itertools import product
6 | import subprocess
7 | 
8 | 
9 | def main() -> None:
10 |     sampler_names = ["nelder-mead", "nelder-mead-subTPE", "TPE"]
11 |     func_ids = list(range(1, 25))
12 |     dims = [2, 3, 5, 10, 20, 40]
13 |     execute_times = ["0:01:00", "0:02:00", "0:03:00", "0:10:00", "0:30:00", "3:00:00"]
14 |     instances = list(range(1, 16))
15 |     optuna_seeds = list(range(1, 16))
16 | 
17 |     combinations = product(
18 |         sampler_names, func_ids, zip(dims, execute_times, strict=False), zip(instances, optuna_seeds, strict=False)
19 |     )
20 | 
21 |     with ThreadPoolExecutor() as pool:
22 |         for sampler_name, func_id, (dim, execute_time), (instance, optuna_seed) in combinations:
23 |             execute_time = "0:05:00" if sampler_name == "nelder-mead" else execute_time
24 |             print(sampler_name, (func_id, execute_time), dim, (instance, optuna_seed))
25 | 
26 |             aiaccel_job_command = f"""\
27 | aiaccel-job pbs --config job_config.yaml cpu --walltime {execute_time} log/job_{func_id}_{dim}_{instance}.log \
28 |     -- python3.13 experiment_coco.py --func_id {func_id} --dim {dim} \
29 |     --instance {instance} --optuna_seed {optuna_seed} --sampler_name {sampler_name}
30 | """
31 | 
32 |             pool.submit(subprocess.run, aiaccel_job_command, shell=True)
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
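# For reference, one fully expanded submission command produced by the f-string
# above (for sampler_name="TPE", func_id=1, dim=2, instance=1, optuna_seed=1,
# where dim=2 maps to execute_time="0:01:00") would look like:
#
#   aiaccel-job pbs --config job_config.yaml cpu --walltime 0:01:00 log/job_1_2_1.log \
#       -- python3.13 experiment_coco.py --func_id 1 --dim 2 --instance 1 --optuna_seed 1 --sampler_name TPE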
-------------------------------------------------------------------------------- /tests/hpo/optuna/test_hparams.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import optuna 5 | 6 | from aiaccel.hpo.optuna.hparams import ( 7 | Categorical, 8 | Const, 9 | Float, 10 | Int, 11 | ) 12 | 13 | 14 | def test_const() -> None: 15 | const = Const(value=0.5) 16 | assert const(trial=None, name="x1") == 0.5 17 | 18 | 19 | def test_float() -> None: 20 | suggest_float = Float(low=0.0, high=1.0, step=None, log=False) 21 | trial = optuna.create_study().ask() 22 | 23 | assert isinstance(suggest_float(trial=trial, name="x2"), float) 24 | 25 | 26 | def test_int() -> None: 27 | suggest_int = Int(low=0, high=10, step=1, log=False) 28 | trial = optuna.create_study().ask() 29 | 30 | assert isinstance(suggest_int(trial=trial, name="x3"), int) 31 | 32 | 33 | def test_categorical() -> None: 34 | suggest_categorical = Categorical(choices=[0, 1, 2]) 35 | trial = optuna.create_study().ask() 36 | 37 | assert suggest_categorical(trial=trial, name="x4") in [0, 1, 2] 38 | 39 | 40 | def test_discrete_uniform() -> None: 41 | suggest_discrete_uniform = Float(low=0.0, high=1.0, step=0.1) 42 | trial = optuna.create_study().ask() 43 | 44 | assert isinstance(suggest_discrete_uniform(trial=trial, name="x5"), float) 45 | 46 | 47 | def test_log_uniform() -> None: 48 | suggest_log_uniform = Float(low=0.1, high=1.0, log=True) 49 | trial = optuna.create_study().ask() 50 | 51 | assert isinstance(suggest_log_uniform(trial=trial, name="x6"), float) 52 | -------------------------------------------------------------------------------- /aiaccel/hpo/optuna/hparams.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Generic, TypeVar 5 | 6 | from collections.abc import Sequence 7 | from dataclasses import dataclass 8 | 9 | from optuna.trial import Trial 10 | 11 | T = TypeVar("T") 12 | 13 | 14 | @dataclass 15 | class Hparam(Generic[T]): 16 | def __call__(self, trial: Trial, name: str) -> T: 17 | raise NotImplementedError 18 | 19 | 20 | @dataclass 21 | class Const(Hparam[T]): 22 | value: T 23 | 24 | def __call__(self, trial: Trial | None, name: str | None) -> T: 25 | return self.value 26 | 27 | 28 | @dataclass 29 | class Float(Hparam[float]): 30 | low: float 31 | high: float 32 | step: float | None = None 33 | log: bool = False 34 | 35 | def __call__(self, trial: Trial, name: str) -> float: 36 | return trial.suggest_float(name=name, low=self.low, high=self.high, step=self.step, log=self.log) 37 | 38 | 39 | @dataclass 40 | class Int(Hparam[int]): 41 | low: int 42 | high: int 43 | step: int = 1 44 | log: bool = False 45 | 46 | def __call__(self, trial: Trial, name: str) -> int: 47 | return trial.suggest_int(name=name, low=self.low, high=self.high, step=self.step, log=self.log) 48 | 49 | 50 | @dataclass 51 | class Categorical(Hparam[None | bool | int | float | str]): 52 | choices: Sequence[None | bool | int | float | str] 53 | 54 | def __call__(self, trial: Trial, name: str) -> None | bool | int | float | str: 55 | return trial.suggest_categorical(name=name, choices=self.choices) 56 | -------------------------------------------------------------------------------- 
/examples/hpo/benchmark/README_ja.md:
--------------------------------------------------------------------------------
1 | # Verification Code for NelderMeadSampler Using coco
2 | 
3 | ## 1. File Structure
4 | 
5 | ### nelder-mead
6 | ### nelder-mead-subTPE
7 | ### TPE
8 | 
9 | - Directories that store the CSV files of the optimization results for each sampler.
10 | 
11 | ### experiment_coco.py
12 | 
13 | - The main code that runs the verification using coco.
14 | - It is designed to run for (number of dimensions * 20) steps with 10 parallel workers.
15 | - When executed, the Optuna results are output to `optuna_csv`, and the per-parallel-step results are output to `step_csv`.
16 | 
17 | ### main_parallel_coco.py
18 | 
19 | - Code that uses `job_dispatcher` to submit a job for each sampler, function, and dimension.
20 | 
21 | ### objective.sh
22 | 
23 | - A qsub script submitted by `job_dispatcher`.
24 | 
25 | ### plot.py
26 | 
27 | - Code that plots the results of each sampler using matplotlib.
28 | - It reads the `optuna_csv` files in each sampler's directory.
29 | 
30 | ### result_bbob_dim_vs_value-fopt_parallel.png
31 | 
32 | - A graph image visualizing the verification results produced by running `plot.py`.
33 | - For each of the 24 benchmark functions, it shows a plot with the number of dimensions on the horizontal axis and the mean and deviation of the optimization results on the vertical axis.
34 | 
35 | ## 2. How to Run
36 | 
37 | - Install aiaccel and activate the virtual environment.
38 | 
39 | - Install coco.
40 |   - See the following repository for details:
41 |     https://github.com/numbbo/coco
42 | 
43 | - Rewrite the virtual environment in `objective.sh` and the job_group in `main_parallel_coco.py` to the appropriate path and ID.
44 | - Running `main_parallel_coco.py` executes the verification for each sampler.
45 | - The results are saved in `optuna_csv` and `step_csv` directly under each directory.
46 | 
47 | ```bash
48 | cd nelder-mead
49 | python main_parallel_coco.py
50 | ```
51 | 
52 | - Running `plot.py` requires pandas and matplotlib.
53 | 
54 | ```bash
55 | pip install pandas matplotlib
56 | python plot.py
57 | ```
58 | 
59 | ## 3. Checking the Results
60 | 
61 | - The verification results for each sampler are output to `optuna_csv` and `step_csv` under the directory corresponding to that sampler.
62 | - The visualization produced by `plot.py` is output to `result_bbob_dim_vs_value-fopt_parallel.png`.
63 | - The visualized results show that nelder-mead-subTPE tends to produce better results in parallel execution, although nelder-mead produces better results for some functions.
-------------------------------------------------------------------------------- /tests/job/apps/local.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | import subprocess 6 | 7 | import pytest 8 | 9 | cmd = ["aiaccel-job", "local"] 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "base_args", 14 | [ 15 | ["cpu"], 16 | ["cpu", "--n_tasks=10"], 17 | ["gpu"], 18 | ["gpu", "--n_tasks=10"], 19 | ], 20 | ) 21 | def test_default(base_args: list[str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 22 | monkeypatch.chdir(tmp_path) 23 | log_path = tmp_path / "test.log" 24 | 25 | subprocess.run(cmd + base_args + [log_path, "--", "sleep", "0"], check=True) 26 | 27 | 28 | def test_config_from_argparse(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 29 | monkeypatch.chdir(tmp_path) 30 | log_path = tmp_path / "test.log" 31 | 32 | config_path = Path(__file__).parent / "config" / "custom_local.yaml" 33 | 34 | subprocess.run(cmd + ["--config", config_path, "cpu", log_path, "sleep", "0"], check=True) 35 | 36 | with open(tmp_path / "config_path.txt") as f: 37 | assert Path(f.read().rstrip("\n")) == Path(config_path) 38 | 39 | 40 | def test_config_from_environ(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 41 | monkeypatch.chdir(tmp_path) 42 | log_path = tmp_path / "test.log" 43 | 44 | config_path = Path(__file__).parent / "config" / "custom_local.yaml" 45 | monkeypatch.setenv("AIACCEL_JOB_CONFIG", str(config_path)) 46 | 47 | subprocess.run(cmd + ["cpu", log_path, "--", "sleep", "0"], check=True) 48 | 49 | with open(tmp_path / "config_path.txt") as f: 50 | assert Path(f.read().rstrip("\n")) == config_path 51 | -------------------------------------------------------------------------------- /aiaccel/torch/functional/linear_sum_assignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import numpy as np 5 | from scipy.optimize import linear_sum_assignment as scipy_linear_sum_assignment 6 | 7 | import torch 8 | 9 | 10 | def linear_sum_assignment(cost_matrix: torch.Tensor, maximize: bool = False) -> tuple[torch.Tensor, torch.Tensor]: 11 | """ 12 | Solve the linear sum assignment problem for a batch of cost matrices. 13 | 14 | Args: 15 | cost_matrix (torch.Tensor): A tensor of shape (..., m, n) 16 | representing the cost matrix for each assignment problem. 17 | maximize (bool): If True, the problem is treated as a maximization problem. 18 | If False, it is treated as a minimization problem. Defaults to False. 19 | Returns: 20 | tuple: A tuple containing two tensors: 21 | - row_indices: Indices of the rows assigned to each column. 22 | - col_indices: Indices of the columns assigned to each row. 
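
    Example:
        A batched sketch (shapes only; the values are illustrative)::

            cost = torch.rand(8, 5, 5)  # a batch of 8 cost matrices
            rows, cols = linear_sum_assignment(cost)
            assert rows.shape == (8, 5) and cols.shape == (8, 5)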
23 | """ 24 | 25 | assert cost_matrix.ndim >= 2, "cost_matrix must have at least 2 dimensions" 26 | 27 | *batch_shape, m, n = cost_matrix.shape 28 | 29 | row_ind_list, col_ind_list = [], [] 30 | for cm in cost_matrix.reshape(-1, m, n).cpu().numpy(): 31 | row_ind, col_ind = scipy_linear_sum_assignment(cm, maximize=maximize) 32 | 33 | row_ind_list.append(row_ind) 34 | col_ind_list.append(col_ind) 35 | 36 | row_indices = torch.from_numpy(np.stack(row_ind_list).reshape(*batch_shape, -1)).to(cost_matrix.device) 37 | col_indices = torch.from_numpy(np.stack(col_ind_list).reshape(*batch_shape, -1)).to(cost_matrix.device) 38 | 39 | return row_indices, col_indices 40 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/sge.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $JOB_ID 5 | echo Hostname: $(hostname) 6 | 7 | export CUDA_VISIBLE_DEVICES=all 8 | 9 | qsub: "qsub -g $JOB_GROUP -l h_rt={args.walltime}" 10 | 11 | cpu: 12 | qsub_args: "-l cpu_40=1" 13 | job: "{command}" 14 | 15 | cpu-array: 16 | n_tasks_per_proc: 128 17 | n_procs: 20 18 | qsub_args: "-l cpu_40=1 -t 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 19 | job: "{command}" 20 | 21 | gpu: 22 | qsub_args: "-l gpu_1=1" 23 | job: "{command}" 24 | 25 | gpu-array: 26 | n_tasks_per_proc: 128 27 | n_procs: 1 28 | qsub_args: "-l gpu_1=1 -t 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 29 | job: "{command}" 30 | 31 | mpi: 32 | n_nodes: 1 33 | qsub_args: "-l cpu_40={args.n_nodes}" 34 | job: | 35 | source /etc/profile.d/modules.sh 36 | module load openmpi 37 | 38 | mpirun -np {args.n_procs} --npernode $(( {args.n_procs} / {args.n_nodes} )) \\ 39 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 40 | {command} 41 | 42 | train: 43 | qsub_args: "-l $( (({args.n_gpus}==1)) && printf node_q || printf node_f )=$(( ({args.n_gpus} + 3) / 4 ))" 44 | job: | 45 | source /etc/profile.d/modules.sh 46 | module load openmpi 47 | 48 | n_gpus=$(nvidia-smi -L | wc -l) 49 | 50 | mpirun -np {args.n_gpus} -map-by ppr:$n_gpus:node:PE=48 \\ 51 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 52 | -x MAIN_ADDR=$(hostname -i) \\ 53 | -x MAIN_PORT=3000 \\ 54 | -x COLUMNS=120 \\ 55 | -x PYTHONUNBUFFERED=true \\ 56 | {command} 57 | 58 | use_scandir: False 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | [aiaccel logo]
10 | 
11 | # AIST Toolkit for Accelerating Machine Learning Research
12 | 
13 | * **Research-Oriented**: designed to accelerate your research cycles written in Python
14 | * **HPC Optimized**: intended for use in HPC clusters, including [AI Bridging Cloud Infrastructure (ABCI)](https://abci.ai/)
15 | * **Highly Modular**: designed to let you pick up any part of aiaccel for your research project
16 | 
17 | # Key Features
18 | * [PyTorch/Lightning Toolkit](https://aistairc.github.io/aiaccel/api_reference/torch.html): training toolkit for HPC clusters.
19 | * [Hyperparameter Optimization (HPO)](https://aistairc.github.io/aiaccel/api_reference/hpo.html): ready-to-use HPO algorithms/tools.
20 | * [OmegaConf Utilities](https://aistairc.github.io/aiaccel/api_reference/config.html): OmegaConf-based config utilities.
21 | 
22 | 
23 | # Installation
24 | ```bash
25 | pip install aiaccel
26 | ```
27 | 
28 | # Acknowledgement
29 | * Part of this software was developed in a project commissioned by the New Energy and Industrial Technology Development Organization (NEDO).
30 | * Part of this software was developed by using ABCI 3.0 provided by AIST and AIST Solutions.
31 | * Part of this software was developed by using the TSUBAME4.0 supercomputer at Institute of Science Tokyo.
32 | 
--------------------------------------------------------------------------------
/docs/source/contribution_guide/tests.md:
--------------------------------------------------------------------------------
1 | (test)=
2 | # Tests
3 | 
4 | ## Adding tests
5 | 
6 | - aiaccel uses pytest for testing.
7 | - Create a directory for each unit test under the tests directory.
8 | - The directory structure under `aiaccel/tests` corresponds to that under `aiaccel/aiaccel`, except for a few modules such as config.py. For example, the test for `aiaccel/aiaccel/hpo/optuna/hparams.py` is `aiaccel/tests/hpo/optuna/test_hparams.py`.
9 | - If you have added a new feature or a bug fix, please create the corresponding test code.
10 | 
11 | 
12 | ## Running tests (WIP)
13 | 
14 | - Move to the aiaccel directory and execute the following command to run all tests in your local environment.
15 | 
16 | ~~~bash
17 | cd aiaccel
18 | pytest
19 | ~~~
20 | 
21 | - Specify a file name as an argument to run only a specific test.
22 | 
23 | ~~~bash
24 | pytest aiaccel/tests/hpo/optuna/test_hparams.py
25 | ~~~
26 | 
27 | - In addition, execute the following commands to check coding styles.
28 | 
29 | ~~~bash
30 | ruff check
31 | ruff format --check
32 | mypy --config-file mypy.ini .
33 | docstrfmt --check docs/source/
34 | ~~~
35 | 
36 | 
37 | ## Coverages
38 | 
39 | No strict criteria for code coverage have been set, but coverage should be taken into account when designing tests. Please watch for the following cases.
40 | 
41 | - A significantly lower overall score.
42 | - Abnormally low coverage of a class or module.
43 | - Tests that do not cover a specific branch of an if statement.
44 | 
45 | ### Measuring coverage
46 | 
47 | Run pytest with the option `--cov` to measure C0 coverage.
48 | 
49 | ~~~bash
50 | pytest --cov=aiaccel
51 | ~~~
52 | 
53 | - Replace `aiaccel` with the appropriate path to measure only the coverage of a specific test.
54 | - Run pytest with the options `--cov` and `--cov-branch` to measure C1 coverage.
55 | 56 | ~~~bash 57 | pytest --cov=aiaccel --cov-branch 58 | ~~~ 59 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/scatter_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import numpy.typing as npt 5 | from typing import TypeVar 6 | 7 | from collections.abc import Callable 8 | 9 | import numpy as np 10 | 11 | import torch.distributed as dist 12 | from torch.utils.data import Dataset, Subset 13 | 14 | T = TypeVar("T") 15 | 16 | 17 | def scatter_dataset( 18 | dataset: Dataset[T], 19 | permute_fn: Callable[[npt.NDArray[np.int64]], npt.NDArray[np.int64]] | None = None, 20 | ) -> Subset[T]: 21 | """ 22 | Splits a dataset into subsets and returns the subset corresponding to the current process rank. 23 | 24 | Args: 25 | dataset (Dataset[T]): The input dataset to be split. 26 | permute_fn (Callable[[npt.NDArray[np.int64]], npt.NDArray[np.int64]] | None, optional): 27 | A function that takes an array of indices and returns a permuted version of the array. 28 | If None, a default permutation function using np.random.Generator is used. 29 | Defaults to None. 30 | 31 | Returns: 32 | Subset[T]: The subset of the input dataset corresponding to the current process rank. 33 | """ 34 | 35 | if permute_fn is None: 36 | permute_fn = np.random.Generator(np.random.PCG64(0)).permutation 37 | 38 | world_size = dist.get_world_size() 39 | rank = dist.get_rank() 40 | 41 | dataset_size = len(dataset) # type: ignore[arg-type] 42 | total_size = int(np.ceil(dataset_size / world_size)) * world_size 43 | 44 | indices = permute_fn(np.arange(dataset_size)) 45 | repeated_indices = np.concatenate([indices, indices[: total_size - dataset_size]]) 46 | 47 | split_indices = np.split(repeated_indices, world_size) 48 | 49 | return Subset(dataset, list(split_indices[rank])) 50 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | 3.745401188473625,9.50714306409916,17.40563670799186 3 | 7.319939418114051,5.986584841970366,16.136857010179618 4 | 1.5601864044243652,1.5599452033620265,7.684141829279023 5 | 4.092732049871416,6.640204043382679,14.977162309601907 6 | 5.073199322630971,5.043329732671359,12.912400157971089 7 | 3.704712456699542,4.9709207556996855,12.974724337183765 8 | 2.9286732703557936,1.6323541803337003,9.149709591293272 9 | 3.658814580010525,3.3197397122596115,12.156637668428768 10 | 2.951622208700302,2.4579447020537373,10.106158974565025 11 | 1.5372374660798567,0.7343546816419897,6.41495086832353 12 | 0.16875060014842802,0.6619457046703161,3.578287844876108 13 | 1.2065902187692537,1.1290476982590896,5.27149310836656 14 | 1.1124539377693488,0.8149256915533463,4.47829195031602 15 | 0.07461431914852312,0.3478236979645728,2.528730695609976 16 | 0.6170681987089122,0.6599051964353954,4.589274831289604 17 | 0.12168245964847557,0.5048847013174445,3.2642552606368476 18 | 0.5935341284589359,0.5813746947589595,4.5089318967941985 19 | 0.34584125892871764,0.503864447199984,3.9172427824021305 20 | 0.2219948241636085,0.4651093234204963,3.454138754047353 21 | 0.16007160678105392,0.44573176153075245,3.198080417589434 22 | 0.11300346628110147,0.2886707581778808,2.280678955373986 23 | 
0.10866396959741442,0.18056378660809896,1.4848792823714767 24 | 0.023206681964883624,0.08265572304191926,0.4282522158214057 25 | 0.07027482246483607,0.23971672639479097,1.789417957224991 26 | 0.061595829097461974,0.023502783255227255,0.2985090187707833 27 | 0.07553261256429361,0.11682151987833611,0.8464992601260697 28 | 0.058966934047733205,0.08495038651345468,0.5575744894703902 29 | 0.025835577014612393,0.02120811978369183,0.12407151097718838 30 | 0.033461192510460404,0.0525055872806894,0.2766147044342304 31 | 0.045622106929999184,0.030179818393708938,0.23278162079337505 32 | -------------------------------------------------------------------------------- /examples/torch/image_classification/src/image_classification/task.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as fn 7 | 8 | from torchmetrics.classification import MulticlassAccuracy 9 | 10 | from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule 11 | 12 | 13 | class ImageClassificationTask(OptimizerLightningModule): 14 | def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10): 15 | super().__init__(optimizer_config) 16 | 17 | self.model = model 18 | 19 | self.training_accuracy = MulticlassAccuracy(num_classes=num_classes) 20 | self.validation_accuracy = MulticlassAccuracy(num_classes=num_classes) 21 | 22 | @torch.compile 23 | def forward(self, x: torch.Tensor) -> torch.Tensor: 24 | return self.model(x) # type: ignore 25 | 26 | def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor: 27 | x, y = batch 28 | 29 | logits = self(x) 30 | 31 | loss = fn.cross_entropy(logits, y) 32 | 33 | self.log_dict( 34 | { 35 | "training/loss": loss, 36 | "training/accuracy": self.training_accuracy(logits, y), 37 | }, 38 | prog_bar=True, 39 | ) 40 | 41 | return loss 42 | 43 | def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> None: 44 | x, y = batch 45 | 46 | logits = self(x) 47 | 48 | loss = fn.cross_entropy(logits, y) 49 | 50 | self.log_dict( 51 | { 52 | "validation/loss": loss, 53 | "validation/accuracy": self.validation_accuracy(logits, y), 54 | }, 55 | prog_bar=True, 56 | ) 57 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/file_cached_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any, TypeVar 5 | 6 | from multiprocessing import Manager 7 | from pathlib import Path 8 | import pickle as pkl 9 | import uuid 10 | 11 | from torch.utils.data import Dataset 12 | 13 | __all__ = ["FileCachedDataset"] 14 | 15 | 16 | T_co = TypeVar("T_co", covariant=True) 17 | 18 | 19 | class FileCachedDataset(Dataset[T_co]): 20 | """ 21 | A dataset wrapper that caches samples to disk to reduce memory usage. 22 | 23 | This class wraps an existing `torch.utils.data.Dataset` and caches samples 24 | as pickle files in a specified directory. 25 | 26 | Args: 27 | dataset (Dataset[T]): The dataset to wrap. 28 | cache_path (str | Path): Directory where cached samples will be stored. 
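
    Example:
        A minimal sketch (``base`` stands for any map-style dataset; the cache
        directory name is illustrative)::

            cached = FileCachedDataset(base, cache_path="/tmp/dataset_cache")
            sample = cached[0]  # first access pickles the sample to disk
            sample = cached[0]  # subsequent accesses load the pickle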
29 | 30 | Methods: 31 | __len__(): Returns the number of samples in the dataset. 32 | __getitem__(index: int) -> Any: Retrieves a sample from cache or the original dataset. 33 | """ 34 | 35 | def __init__(self, dataset: Dataset[T_co], cache_path: str | Path) -> None: 36 | self.dataset = dataset 37 | 38 | self.manager = Manager() 39 | self.cache = self.manager.dict() 40 | 41 | self.cache_path = Path(cache_path) 42 | self.cache_path.mkdir(exist_ok=True, parents=True) 43 | 44 | def __len__(self) -> int: 45 | return len(self.dataset) # type: ignore[arg-type] 46 | 47 | def __getitem__(self, index: int) -> Any: 48 | if index not in self.cache: 49 | sample = self.dataset[index] 50 | 51 | self.cache[index] = self.cache_path / f"cache-{uuid.uuid4()}.pkl" 52 | with open(self.cache[index], "wb") as f: 53 | pkl.dump(sample, f) 54 | else: 55 | with open(self.cache[index], "rb") as f: 56 | sample = pkl.load(f) 57 | 58 | return sample 59 | -------------------------------------------------------------------------------- /tests/torch/datasets/test_hdf5_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | 8 | import torch 9 | 10 | import h5py as h5 11 | 12 | from aiaccel.torch.datasets.hdf5_dataset import HDF5Dataset, RawHDF5Dataset 13 | 14 | # with h5.File(Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5", "w") as f: 15 | # for ii in range(10): 16 | # g = f.create_group(f"grp{ii}") # noqa: ERA001 17 | # g.create_dataset("foo", [2, 3, 4]) # noqa: ERA001 18 | # g.create_dataset("bar", [5, 6]) # noqa: ERA001 19 | 20 | # g["foo"][:] = np.random.randn(2, 3, 4) # noqa: ERA001 21 | # g["bar"][:] = np.random.randn(5, 6) # noqa: ERA001 22 | 23 | 24 | def test_raw_hdf5_dataset() -> None: 25 | hdf5_filename = Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5" 26 | f_hdf5 = h5.File(hdf5_filename) 27 | 28 | dataset = RawHDF5Dataset(hdf5_filename) 29 | 30 | assert len(dataset) == 10 31 | assert list(dataset.grp_list) == [f"grp{idx}" for idx in range(10)] 32 | 33 | sample = dataset[5] 34 | assert sorted(sample.keys()) == ["bar", "foo"] 35 | assert np.array_equal(sample["bar"], f_hdf5["grp5"]["bar"][:]) 36 | assert np.array_equal(sample["foo"], f_hdf5["grp5"]["foo"][:]) 37 | 38 | 39 | def test_hdf5_dataset() -> None: 40 | hdf5_filename = Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5" 41 | f_hdf5 = h5.File(hdf5_filename) 42 | 43 | dataset = HDF5Dataset(hdf5_filename) 44 | 45 | assert len(dataset) == 10 46 | assert list(dataset.grp_list) == [f"grp{idx}" for idx in range(10)] 47 | 48 | sample = dataset[5] 49 | assert sorted(sample.keys()) == ["bar", "foo"] 50 | assert isinstance(sample["bar"], torch.Tensor) 51 | assert isinstance(sample["foo"], torch.Tensor) 52 | assert np.array_equal(sample["bar"].numpy(), f_hdf5["grp5"]["bar"][:]) 53 | assert np.array_equal(sample["foo"].numpy(), f_hdf5["grp5"]["foo"][:]) 54 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/pbs.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $PBS_JOBID 5 | echo Hostname: $(hostname) 6 | 7 | export CUDA_VISIBLE_DEVICES=all 8 | 9 | qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1" 10 | 11 | 
cpu: 12 | qsub_args: "-q rt_HF -l select=1" 13 | job: "{command}" 14 | 15 | cpu-array: 16 | n_tasks_per_proc: 128 17 | n_procs: 24 18 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 19 | job: "{command}" 20 | 21 | gpu: 22 | qsub_args: "-q rt_HF -l select=1" 23 | job: "{command}" 24 | 25 | gpu-array: 26 | n_tasks_per_proc: 128 27 | n_procs: 8 28 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 29 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 30 | 31 | mpi: 32 | n_nodes: 1 33 | qsub_args: >- 34 | -q rt_HF 35 | -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} )) 36 | job: | 37 | source /etc/profile.d/modules.sh 38 | module load hpcx 39 | 40 | mpirun -np {args.n_procs} -bind-to none -map-by slot \\ 41 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 42 | {command} 43 | 44 | train: 45 | qsub_args: >- 46 | -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF ) 47 | -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 ) 48 | job: | 49 | source /etc/profile.d/modules.sh 50 | module load hpcx 51 | 52 | mpirun -np {args.n_gpus} -bind-to none -map-by slot \\ 53 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 54 | -x MAIN_ADDR=$(hostname -i) \\ 55 | -x MAIN_PORT=3000 \\ 56 | -x COLUMNS=120 \\ 57 | -x PYTHONUNBUFFERED=true \\ 58 | {command} 59 | 60 | use_scandir: False 61 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley_logscale.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | 0.05564180225431373,32147.193482816965,20.814962543634394 3 | 208.90047049266641,9.695826644515218,21.448998676398375 4 | 0.0003632339256943143,0.0003630322466779861,0.0014595551233256288 5 | 0.9690955677354745,5.755253900459714,12.296378779985623 6 | 0.03231014054617535,38.33310112612593,21.3581260787541 7 | 0.018761886564029658,0.04570932895685663,0.20359442477314715 8 | 0.004495663495585621,3.4162066498470134,9.311384187577072 9 | 0.0034258048967100124,0.1179667403294333,0.6654758825765974 10 | 0.0019893000084822924,0.00014066643139422686,0.005746539690054675 11 | 0.0039935573463010235,0.0032139294641708098,0.015198674098908072 12 | 0.00018093674105720218,1.5889101235890364e-05,0.0005146143530829761 13 | 3.303793418604823e-05,4.1006628675986886e-05,0.00014901809147715994 14 | 0.000167582287432558,9.626334554501767e-05,0.000547623923871754 15 | 0.00011382796427517295,4.95699892804902e-05,0.0003515684113644113 16 | 2.078428499530732e-05,0.00012793034128993056,0.00036703327257470164 17 | 3.570121591230095e-05,7.594601695415612e-05,0.00023754589410174276 18 | 1.0362079557367228e-05,6.28261204382591e-05,0.00018020780579774964 19 | 2.5701380739833768e-05,6.208705753197202e-05,0.000190180515296845 20 | 1.3319973191802473e-05,4.149475743214036e-05,0.00012331408252919118 21 | 4.2468733735770256e-05,2.7083641296808898e-05,0.0001425348740156096 22 | 1.712220841850161e-05,2.7406035616097617e-05,9.142849883048143e-05 23 | 1.2326314004310186e-05,2.240486306276013e-05,7.234532742117494e-05 24 | 2.332753235476556e-05,2.8736733977299986e-05,0.00010472549182694024 25 | 2.158731737744854e-05,1.551623938723579e-05,7.521275362165625e-05 26 | 
1.1406780985592453e-05,1.2097380968715272e-05,4.7035972471576315e-05 27 | 1.5999205650361907e-05,1.5982755233483398e-05,6.397754858866733e-05 28 | 1.2904227006156957e-05,1.7650495365273593e-05,6.185504950195764e-05 29 | 1.3932305367519067e-05,1.5282270702374627e-05,5.850288671993553e-05 30 | 1.2315557985445971e-05,1.0474235817618713e-05,4.573507709437763e-05 31 | 1.2850374704515058e-05,1.3115878062101893e-05,5.194419819432028e-05 32 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/job_config.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $PBS_JOBID 5 | echo Hostname: $(hostname) 6 | 7 | export NVIDIA_VISIBLE_DEVICES=all 8 | export JOB_GROUP=job_group 9 | 10 | # activate environment 11 | 12 | qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1" 13 | 14 | cpu: 15 | qsub_args: "-q rt_HF -l select=1" 16 | job: "{command}" 17 | 18 | cpu-array: 19 | n_tasks_per_proc: 128 20 | n_procs: 24 21 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 22 | job: "{command}" 23 | 24 | gpu: 25 | qsub_args: "-q rt_HF -l select=1" 26 | job: "{command}" 27 | 28 | gpu-array: 29 | n_tasks_per_proc: 1 30 | n_procs: 1 31 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 32 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 33 | 34 | mpi: 35 | n_nodes: 1 36 | qsub_args: >- 37 | -q rt_HF 38 | -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} )) 39 | job: | 40 | source /etc/profile.d/modules.sh 41 | module load hpcx 42 | 43 | mpirun -np {args.n_procs} -bind-to none -map-by slot \\ 44 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 45 | {command} 46 | 47 | train: 48 | qsub_args: >- 49 | -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF ) 50 | -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 ) 51 | job: | 52 | source /etc/profile.d/modules.sh 53 | module load hpcx 54 | 55 | mpirun -np {args.n_gpus} -bind-to none -map-by slot \\ 56 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 57 | -x MAIN_ADDR=$(hostname -i) \\ 58 | -x MAIN_PORT=3000 \\ 59 | -x COLUMNS=120 \\ 60 | -x PYTHONUNBUFFERED=true \\ 61 | {command} 62 | -------------------------------------------------------------------------------- /aiaccel/torch/apps/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import os 7 | 8 | from hydra.utils import instantiate 9 | from omegaconf import OmegaConf as oc # noqa: N813 10 | 11 | import lightning as lt 12 | 13 | from aiaccel.config import ( 14 | prepare_config, 15 | ) 16 | from aiaccel.config.git import collect_git_status_from_config, print_git_status 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def get_rank(default: int = 0) -> int: 22 | for key in [ 23 | "GLOBAL_RANK", # PyTorch Lightning 24 | "RANK", # torchrun / deepspeed / pytorch launcher 25 | "OMPI_COMM_WORLD_RANK", # OpenMPI 26 | "PMI_RANK", # MPICH / Intel MPI 27 | "MV2_COMM_WORLD_RANK", 
# MVAPICH2 28 | "SLURM_PROCID", # Slurm 29 | ]: 30 | rank = os.environ.get(key) 31 | if rank is not None: 32 | try: 33 | return int(rank) 34 | except ValueError: 35 | pass 36 | 37 | return default 38 | 39 | 40 | def main() -> None: 41 | parser = ArgumentParser() 42 | parser.add_argument("config", type=str, help="Config file in YAML format") 43 | args, unk_args = parser.parse_known_args() 44 | 45 | is_rank_zero = get_rank() == 0 46 | config = prepare_config( 47 | config_filename=args.config, 48 | overwrite_config=oc.from_cli(unk_args), 49 | print_config=is_rank_zero, 50 | save_config=is_rank_zero, 51 | save_filename="merged_config.yaml", 52 | ) 53 | 54 | if is_rank_zero: 55 | status_list = collect_git_status_from_config(config) 56 | print_git_status(status_list) 57 | 58 | if "seed" in config: 59 | lt.seed_everything(config.seed, workers=True) 60 | 61 | # build trainer 62 | trainer: lt.Trainer = instantiate(config.trainer) 63 | 64 | # start training 65 | trainer.fit( 66 | model=instantiate(config.task), 67 | datamodule=instantiate(config.datamodule), 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /docs/source/api_reference/torch.rst: -------------------------------------------------------------------------------- 1 | ########################### 2 | PyTorch/Lightning Toolkit 3 | ########################### 4 | 5 | ********** 6 | Datasets 7 | ********** 8 | 9 | .. currentmodule:: aiaccel.torch.datasets 10 | 11 | .. autosummary:: 12 | :toctree: generated/ 13 | 14 | CachedDataset 15 | FileCachedDataset 16 | HDF5Dataset 17 | RawHDF5Dataset 18 | scatter_dataset 19 | 20 | ************ 21 | Functional 22 | ************ 23 | 24 | .. currentmodule:: aiaccel.torch.functional 25 | 26 | .. autosummary:: 27 | :toctree: generated/ 28 | 29 | linear_sum_assignment 30 | 31 | ************************** 32 | Learning Rate Schedulers 33 | ************************** 34 | 35 | .. currentmodule:: aiaccel.torch.lr_schedulers 36 | 37 | .. autosummary:: 38 | :toctree: generated/ 39 | 40 | SequentialLR 41 | 42 | **************************** 43 | Inference Pipeline Helpers 44 | **************************** 45 | 46 | .. currentmodule:: aiaccel.torch.pipelines 47 | 48 | .. autosummary:: 49 | :toctree: generated/ 50 | 51 | BasePipeline 52 | reorder_fields 53 | 54 | ********************* 55 | Lightning Utilities 56 | ********************* 57 | 58 | .. currentmodule:: aiaccel.torch.lightning 59 | 60 | .. autosummary:: 61 | :toctree: generated/ 62 | 63 | OptimizerLightningModule 64 | OptimizerConfig 65 | build_param_groups 66 | load_checkpoint 67 | ABCIEnvironment 68 | 69 | Lightning Datamodules 70 | ===================== 71 | 72 | .. currentmodule:: aiaccel.torch.lightning.datamodules 73 | 74 | .. autosummary:: 75 | :toctree: generated/ 76 | 77 | SingleDataModule 78 | 79 | Lightning Callbacks 80 | =================== 81 | 82 | .. currentmodule:: aiaccel.torch.lightning.callbacks 83 | 84 | .. autosummary:: 85 | :toctree: generated/ 86 | 87 | SaveMetricCallback 88 | LoadPretrainedCallback 89 | PrintUnusedParam 90 | 91 | **************** 92 | H5py Utilities 93 | **************** 94 | 95 | .. currentmodule:: aiaccel.torch.h5py 96 | 97 | .. 
autosummary:: 98 | :toctree: generated/ 99 | 100 | HDF5Writer 101 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_shpere_parallel.csv: -------------------------------------------------------------------------------- 1 | x,y,z,objective 2 | -7.527592869158251,27.04285838459497,13.919636508684306,981.7371245469577 3 | 5.919509051822196,-20.63888157345381,-20.64032877982784,887.0271921570455 4 | -26.514983269908033,21.970568746496113,6.066900704592527,1222.5575130073998 5 | 12.48435466776273,-28.76493034225185,28.19459112971966,1778.215298037157 6 | 1.5549994860073504,-9.653374244853046,13.988330303767995,291.27904239998924 7 | 26.479593715688896,-24.13683370230404,-1.2218086828428865,1285.2484409781014 8 | -13.2663390235088,10.443718134296077,4.244723357733673,303.084675938987 9 | -4.729101515525668,10.213339578295692,6.558605734621125,169.6920156679335 10 | 0.2196810170732446,-8.485493542103784,-6.188221157226787,110.34594149309659 11 | 11.296725015212084,-15.727403606736836,5.327753229707882,403.35217475518004 12 | -7.125573013828579,3.900937699037849,4.515480825727225,86.38067279468704 13 | -9.31166182752802,13.405896735006218,-10.731086701686952,381.5813350590154 14 | -1.1616658423764918,-3.8885564998882307,7.808476052404258,77.44263744313807 15 | -0.6492703772288824,-15.862081140265136,-2.4681152540179947,258.1187630302182 16 | -3.709143730951472,3.6944843986554847,4.301925487461345,45.91352508843491 17 | -8.21726940851094,10.956737273973852,17.27214273428867,485.9005228565786 18 | -1.8895565893228015,-3.624935838084375,-0.32313018434792173,16.814997050518407 19 | 2.6186622387280694,-6.446942991915929,3.3427000779512293,59.59410967268937 20 | -0.8250262119876444,-0.3630397876749818,-2.9274791316944886,9.382600204408494 21 | -0.6567063967932206,1.3997185684316427,-8.295456723743861,71.2050776179087 22 | -6.901146593569348,6.251282173846681,-2.6418226303386065,93.6835799331563 23 | 0.23871003065371532,-3.2723867004752765,1.8465694008787705,14.175315748443946 24 | 2.058561883846985,-8.534725949478572,-5.237952097570439,104.51536623877313 25 | -2.2672173272518577,0.6371818116219701,1.9169560912033994,9.220995725654728 26 | -0.012799083067722927,1.6261060537321832,0.880494424606376,3.4196551462747413 27 | 0.9255796700598163,4.251626999640462,1.4823067290835248,21.130263108786092 28 | -2.308738445525199,4.539218752261391,-1.9332551448019129,29.672255545630637 29 | -0.39815208839101324,-1.3194853372911095,0.9016132644585998,2.7124731194640512 30 | -0.9604194538194184,0.9922414730503442,5.393521651873405,30.99702447734313 31 | -0.8588745224455878,-0.02421947249365025,-0.8472289358025149,1.4560488978150692 32 | -------------------------------------------------------------------------------- /aiaccel/job/apps/local.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3
2 | 
3 | 
4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
5 | # SPDX-License-Identifier: MIT
6 | 
7 | import logging
8 | from math import ceil
9 | from pathlib import Path
10 | import shlex
11 | import subprocess
12 | 
13 | from aiaccel.job.apps import prepare_argument_parser
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | def main() -> None:
19 |     # Load configuration (from the default YAML string)
20 |     config, parser, sub_parsers = prepare_argument_parser("local.yaml")
21 | 
22 |     args = parser.parse_args()
23 |     mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode
24 | 
25 |     for key in ["walltime", "n_nodes", "n_tasks_per_proc"]:
26 |         if getattr(args, key, None) is not None:
27 |             logger.warning(f"Argument '{key}' is defined for compatibility and will not be used in aiaccel-job local.")
28 | 
29 |     # Prepare the job script and arguments
30 |     job = config[mode].job.format(command=shlex.join(args.command), args=args)
31 | 
32 |     if mode in ["cpu-array", "gpu-array"]:
33 |         n_tasks_per_proc = ceil(args.n_tasks / args.n_procs)
34 |         job = f"""\
35 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do
36 |     TASK_INDEX=$(( 1 + {n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) ))
37 | 
38 |     if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then
39 |         break
40 |     fi
41 | 
42 |     TASK_INDEX=$TASK_INDEX \\
43 |     TASK_STEPSIZE={n_tasks_per_proc} \\
44 |         {job} 2>&1 | tee {args.log_filename.with_suffix("")}.${{LOCAL_PROC_INDEX}}.log &
45 | 
46 |     pids[$LOCAL_PROC_INDEX]=$!
47 | done
48 | 
49 | for i in "${{!pids[@]}}"; do
50 |     wait ${{pids[$i]}}
51 | done
52 | """
53 |     else:
54 |         job = f"{job} 2>&1 | tee {args.log_filename}"
55 | 
56 |     job_script = f"""\
57 | #! /bin/bash
58 | 
59 | set -eE -o pipefail
60 | trap 'exit $?' ERR EXIT  # at error and exit
61 | trap 'echo 143' TERM     # at termination (by job scheduler)
62 | trap 'kill 0' INT
63 | 
64 | 
65 | {config.script_prologue}
66 | 
67 | {job}
68 | """
69 | 
70 |     # Create the job script file, remove old status files, and run the job
71 |     args.log_filename.parent.mkdir(exist_ok=True, parents=True)
72 | 
73 |     job_filename: Path = args.log_filename.with_suffix(".sh")
74 |     with open(job_filename, "w") as f:
75 |         f.write(job_script)
76 | 
77 |     subprocess.run(f"bash {job_filename}", shell=True, check=True)
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     main()
82 | 
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. figure:: _static/logo_aiaccel.png
2 |    :width: 600px
3 | 
4 | #######################
5 | Aiaccel Documentation
6 | #######################
7 | 
8 | Aiaccel is a toolkit for accelerating machine learning research.
9 | 
10 | .. grid:: 1 1 2 2
11 |    :gutter: 2
12 |    :padding: 0
13 |    :class-row: surface
14 | 
15 |    .. grid-item-card:: :octicon:`sliders;1.5em;sd-text-primary` Configuration management
16 |       :link: user_guide/config.html
17 | 
18 |       OmegaConf-based config management.
19 | 
20 | 
21 |    .. grid-item-card:: :octicon:`server;1.5em;sd-text-primary` Job management
22 |       :link: user_guide/config.html
23 | 
24 |       HPC-oriented job abstraction.
25 | 
26 |    .. grid-item-card:: :octicon:`flame;1.5em;sd-text-primary` PyTorch/Lightning toolkit
27 |       :link: user_guide/torch.html
28 | 
29 |       Training toolkit for HPC clusters.
30 | 
31 |    .. grid-item-card:: :octicon:`beaker;1.5em;sd-text-primary` Hyperparameter optimization
32 |       :link: user_guide/hpo.html
33 | 
34 |       Ready-to-use algorithms/tools.
35 | 
36 | **************
37 | Key Features
38 | **************
39 | 
40 | :octicon:`zap;1em` Research-Oriented
41 |    Designed to accelerate your research cycles written in Python
42 | 
43 | :octicon:`cpu;1em` HPC Optimized
44 |    Intended for use in high-performance computing (HPC) clusters, including AI Bridging
45 |    Cloud Infrastructure (ABCI).
46 | 
47 | :octicon:`server;1em` Highly Modular
48 |    Designed to let you pick up any part of aiaccel for your research project.
49 | 
50 | ************************
51 | Aiaccel is used in ...
52 | ************************
53 | 
54 | - M3L: Multimodal machine listening toolkit (https://github.com/b-sigpro/m3l)
55 | - SBSS: Scalable blind source separation toolkit (https://github.com/b-sigpro/sbss)
56 | 
57 | *****************
58 | Acknowledgments
59 | *****************
60 | 
61 | - Part of this work was developed under a commissioned project of the New Energy and
62 |   Industrial Technology Development Organization (NEDO).
63 | - Part of this software was developed by using ABCI 3.0 provided by AIST and AIST
64 |   Solutions.
65 | - Part of this software was developed by using the TSUBAME4.0 supercomputer at Institute
66 |   of Science Tokyo.
67 | 
68 | .. toctree::
69 |    :hidden:
70 | 
71 |    user_guide/index.rst
72 | 
73 | .. toctree::
74 |    :hidden:
75 |    :maxdepth: 2
76 | 
77 |    api_reference/index.rst
78 | 
79 | .. toctree::
80 |    :hidden:
81 |    :maxdepth: 2
82 | 
83 |    contribution_guide/index.rst
84 | 
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/abci_environment.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import logging
5 | import os
6 | 
7 | from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
8 | 
9 | log = logging.getLogger(__name__)
10 | 
11 | 
12 | class ABCIEnvironment(ClusterEnvironment):
13 |     """
14 |     Environment class for ABCI.
15 | 
16 |     This class provides methods to interact with the ABCI environment,
17 |     such as retrieving the world size, global rank, node rank, and local rank.
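
    Example:
        A minimal sketch of the typical use: pass an instance to a Lightning
        trainer as a plugin (all other trainer arguments are omitted)::

            import lightning as lt

            trainer = lt.Trainer(plugins=[ABCIEnvironment()])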
18 | """
19 |
20 | def __init__(self) -> None:
21 | self._world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
22 | self._rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
23 | self._local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
24 | self._local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"])
25 |
26 | self._main_address = os.environ["MAIN_ADDR"]
27 | self._main_port = int(os.environ["MAIN_PORT"])
28 |
29 | @property
30 | def creates_processes_externally(self) -> bool:
31 | return True
32 |
33 | @property
34 | def main_address(self) -> str:
35 | return self._main_address
36 |
37 | @property
38 | def main_port(self) -> int:
39 | return self._main_port
40 |
41 | @staticmethod
42 | def detect() -> bool:
43 | return True
44 |
45 | def world_size(self) -> int:
46 | return self._world_size
47 |
48 | def global_rank(self) -> int:
49 | return self._rank
50 |
51 | def node_rank(self) -> int:
52 | return self._rank // self._local_size
53 |
54 | def local_rank(self) -> int:
55 | return self._local_rank
56 |
57 | def set_world_size(self, size: int) -> None:
58 | if size != self.world_size():
59 | raise ValueError(f"`size` is expected to be {self.world_size()}, but {size} is given.")
60 |
61 | def set_global_rank(self, rank: int) -> None:
62 | if rank != self.global_rank():
63 | raise ValueError(f"`rank` is expected to be {self.global_rank()}, but {rank} is given.")
64 |
65 | def validate_settings(self, num_devices: int, num_nodes: int) -> None:
66 | if num_devices != self._local_size:
67 | raise ValueError("`num_devices` should match ${OMPI_COMM_WORLD_LOCAL_SIZE}")
68 |
69 | if num_devices * num_nodes != self._world_size:
70 | raise ValueError("`num_devices * num_nodes` should match ${OMPI_COMM_WORLD_SIZE}")
71 | -------------------------------------------------------------------------------- /aiaccel/hpo/optuna/hparams_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any
5 |
6 | from collections.abc import Callable
7 |
8 | from optuna.trial import Trial
9 |
10 | from aiaccel.hpo.optuna.hparams import Const, Float, Hparam, T
11 |
12 |
13 | class HparamsManager:
14 | """
15 | Manages hyperparameters for optimization.
16 | This class allows defining hyperparameters with various types and provides
17 | a method to suggest hyperparameters for a given trial.
18 | Attributes:
19 | params (dict): A dictionary where keys are hyperparameter names and values
20 | are callables that take a Trial object and return a hyperparameter value.
21 | Methods:
22 | __init__(**params_def: int | float | str | list[int | float] | Hparam[T]) -> None:
23 | Initializes the HparamsManager with the given hyperparameter definitions.
24 | suggest_hparams(trial: Trial) -> dict[str, float | int | str | list[float | int | str]]:
25 | Suggests hyperparameters for the given trial.
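Example:
    A minimal sketch (assuming an existing Optuna study; per the constructor
    below, a bare two-element list is interpreted as a float range and a
    scalar as a constant)::

        manager = HparamsManager(
            lr=Float(low=1e-5, high=1e-1),  # explicit float hyperparameter
            dropout=[0.0, 0.5],             # shorthand for Float(low=0.0, high=0.5)
            optimizer="adam",               # fixed value, wrapped as Const
        )
        hparams = manager.suggest_hparams(study.ask())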
26 | """
27 |
28 | def __init__(self, **params_def: int | float | str | list[int | float] | Hparam[T]) -> None:
29 | self.params: dict[str, Callable[[Trial, str], Any]] = {}
30 | for name, param in params_def.items():
31 | if callable(param):
32 | self.params[name] = param
33 | else:
34 | if isinstance(param, list):
35 | low, high = param
36 | self.params[name] = Float(low=low, high=high)
37 | else:
38 | self.params[name] = Const(value=param)
39 |
40 | def suggest_hparams(self, trial: Trial) -> dict[str, float | int | str | list[float | int | str]]:
41 | """
42 | Suggests hyperparameters for a given trial.
43 | This method generates a dictionary of hyperparameters by applying the
44 | parameter functions stored in `self.params` to the provided trial.
45 | Args:
46 | trial (Trial): An Optuna trial object used to suggest hyperparameters.
47 | Returns:
48 | dict[str, float | int | str | list[float | int | str]]: A dictionary
49 | where keys are parameter names and values are the suggested
50 | hyperparameters, which can be of type float, int, str, or a list of
51 | these types.
52 | """
53 |
54 | return {name: param_fn(trial, name) for name, param_fn in self.params.items()}
55 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Verification Code for NelderMeadSampler Using COCO
2 |
3 | ## 1. File Structure
4 |
5 | ### nelder-mead
6 | ### nelder-mead-subTPE
7 | ### TPE
8 |
9 | - These directories store the CSV files containing the optimization results for each sampler.
10 |
11 | ### experiment_coco.py
12 |
13 | - This is the main code for validation using COCO.
14 | - It is designed to run with dimensions * 20 steps and 10 parallel executions.
15 | - Upon execution, the results from Optuna are output to `optuna_csv`, and results for each parallel step are output to `step_csv`.
16 |
17 | ### main_parallel_coco.py
18 |
19 | - This code uses `job_dispatcher` to submit jobs for each sampler, function, and dimension.
20 |
21 | ### objective.sh
22 |
23 | - This is a script for qsub submission used by `job_dispatcher`.
24 |
25 | ### plot.py
26 |
27 | - This code uses matplotlib to graph the results of each sampler.
28 | - It references the `optuna_csv` files in each sampler's directory.
29 |
30 | ### result_bbob_dim_vs_value-fopt_parallel.png
31 |
32 | - This is a graph image visualizing the validation results output by running `plot.py`.
33 | - The graph displays 24 benchmark functions with the number of dimensions on the horizontal axis and the mean and standard deviation of optimization results on the vertical axis.
34 |
35 | ## 2. Execution Instructions
36 |
37 | - Install aiaccel and activate the virtual environment.
38 |
39 | - Install COCO.
40 | - For details, please refer to the following GitHub repository:
41 | https://github.com/numbbo/coco
42 |
43 | - Please replace the # activate environment and the `job_group` in `job_config.yaml` with the appropriate commands and IDs.
44 | - When you run the command below on ABCI, the validation for each sampler will be executed.
45 |
46 | ```
47 | aiaccel-job pbs --config job_config.yaml cpu --walltime 4:00:00 main_parallel_coco.log -- python3.13 main_parallel_coco.py
48 | ```
49 |
50 | - The results are saved in `optuna_csv` and `step_csv` under each directory.
51 |
52 | - To run `plot.py`, you need to install pandas and matplotlib.
53 |
54 | ```
55 | pip install pandas matplotlib
56 | python plot.py
57 | ```
58 |
59 | ## 3. Checking the Results
60 |
61 | - The validation results for each sampler are output to `optuna_csv` and `step_csv` under the corresponding sampler's directory.
62 | - The visualization results from `plot.py` are output to `result_bbob_dim_vs_value-fopt_parallel.png`.
63 | - From the visualization results, it can be observed that `nelder-mead-subTPE` tends to yield better results during parallel execution. However, in some cases, `nelder-mead` may perform better depending on the function.
64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # tentative
2 | /pixi.lock
3 |
4 | # examples
5 | miniforge3/
6 | Miniforge*.sh
7 | data/
8 | checkpoints/
9 | events*
10 | hparams.yaml
11 | merged_config.yaml
12 | trial*.json
13 | *.db
14 |
15 | # Byte-compiled / optimized / DLL files
16 | __pycache__/
17 | *.py[cod]
18 | *$py.class
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | # lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 | MANIFEST
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | .hypothesis/
62 | .pytest_cache/
63 |
64 | # Translations
65 | *.mo
66 | *.pot
67 |
68 | # Django stuff:
69 | *.log
70 | local_settings.py
71 | db.sqlite3
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | target/
85 |
86 | # Jupyter Notebook
87 | .ipynb_checkpoints
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # celery beat schedule file
93 | celerybeat-schedule
94 |
95 | # SageMath parsed files
96 | *.sage.py
97 |
98 | # Environments
99 | .env
100 | .venv
101 | env/
102 | venv/
103 | ENV/
104 | env.bak/
105 | venv.bak/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # mkdocs documentation
115 | /site
116 |
117 | # mypy
118 | .mypy_cache/
119 |
120 | # application workspace
121 | /work/
122 | work_aiaccel/
123 |
124 | # vscode
125 | .vscode
126 | *.code-workspace
127 |
128 | # document
129 | docs/build/html
130 | docs/source/api_reference/
131 | !docs/source/api_reference/index.rst
132 | !docs/source/api_reference/torch.rst
133 | !docs/source/api_reference/hpo.rst
134 |
135 | # IntelliJ
136 | .idea
137 |
138 | # Docker
139 | .devcontainer
140 | Dockerfile
141 |
142 | # ruff
143 | .ruff_cache
144 |
145 | # example
146 | examples/hpo/samplers/coco/exdata
147 | examples/hpo/samplers/coco/*/optuna_csv
148 | examples/hpo/samplers/coco/*/step_csv
149 |
150 | # pixi environments
151 | .pixi/*
152 | !.pixi/config.toml
153 | -------------------------------------------------------------------------------- /examples/hpo/basic/README.md:
-------------------------------------------------------------------------------- 1 | # Basic usage of aiaccel-hpo
2 | ## Getting started
3 | ```bash
4 | aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] n_trials=100 -- python objective.py --x1={x1} --x2={x2} {out_filename}
5 | ```
6 | A workspace `aiaccel-hpo_***` will be created, and you will get output something like:
7 | ```plain
8 | [I 2025-08-11 23:19:09,865] A new study created in RDB with name: aiaccel-hpo
9 | [I 2025-08-11 23:19:10,159] Trial 3 finished with value: 1.199387 and parameters: {'x1': 0.7651250790017732, 'x2': 0.7835626174783031}. Best is trial 3 with value: 1.199387.
10 | [I 2025-08-11 23:19:10,179] Trial 7 finished with value: 0.13314 and parameters: {'x1': 0.28734908107070123, 'x2': 0.22487902368959145}. Best is trial 7 with value: 0.13314.
11 | [I 2025-08-11 23:19:10,190] Trial 6 finished with value: 0.854472 and parameters: {'x1': 0.5282599103748785, 'x2': 0.7585598197415366}. Best is trial 7 with value: 0.13314.
12 | [I 2025-08-11 23:19:10,202] Trial 1 finished with value: 0.241872 and parameters: {'x1': 0.490180501594382, 'x2': 0.03993315068224257}. Best is trial 7 with value: 0.13314.
13 | [I 2025-08-11 23:19:10,215] Trial 0 finished with value: 0.267713 and parameters: {'x1': 0.1980697319379605, 'x2': 0.4779975949100864}. Best is trial 7 with value: 0.13314.
14 | [I 2025-08-11 23:19:10,225] Trial 4 finished with value: 0.223939 and parameters: {'x1': 0.42026494162838846, 'x2': 0.2175229115138555}. Best is trial 7 with value: 0.13314.
15 | [I 2025-08-11 23:19:10,238] Trial 2 finished with value: 1.099494 and parameters: {'x1': 0.8241694903158984, 'x2': 0.6482584131495605}. Best is trial 7 with value: 0.13314.
16 | [I 2025-08-11 23:19:10,249] Trial 5 finished with value: 0.489583 and parameters: {'x1': 0.6165836750706742, 'x2': 0.33076842412638574}. Best is trial 7 with value: 0.13314.
17 | [I 2025-08-11 23:19:10,259] Trial 8 finished with value: 1.624779 and parameters: {'x1': 0.9942998803639703, 'x2': 0.7975879798801359}. Best is trial 7 with value: 0.13314.
18 | [I 2025-08-11 23:19:10,273] Trial 9 finished with value: 1.497936 and parameters: {'x1': 0.9652436480683448, 'x2': 0.7524894798074183}. Best is trial 7 with value: 0.13314.
19 | ```
20 |
21 | You can also run the optimization by specifying a config file:
22 | ```bash
23 | aiaccel-hpo optimize --config experiment/config.yaml
24 | ```
25 |
26 | In this case, `experiment/` is used as a workspace.
27 |
28 | You can also combine `aiaccel-hpo` with `aiaccel-job` as:
29 | ```bash
30 | aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] n_trials=100 n_max_jobs=10 -- \
31 | aiaccel-job local cpu {config.working_directory}/{job_name}.log -- \
32 | python objective.py --x1={x1} --x2={x2} {out_filename}
33 | ``` -------------------------------------------------------------------------------- /docs/source/user_guide/torch.rst: -------------------------------------------------------------------------------- 1 | ##########################
2 | Training a PyTorch Model
3 | ##########################
4 |
5 | *****************
6 | Getting Started
7 | *****************
8 |
9 | Aiaccel-based training is a wrapper around PyTorch Lightning, which can be executed as
10 | follows:
11 |
12 | .. code-block:: bash
13 |
14 | python -m aiaccel.torch.apps.train config.yaml
15 |
16 | The config file `config.yaml` typically consists of `trainer`, `datamodule`, and `task`
17 | as follows:
18 |
19 | ..
code-block:: yaml
20 | :caption: config.yaml
21 | :linenos:
22 |
23 | _base_: ${resolve_pkg_path:aiaccel.torch.apps.config}/train_base.yaml
24 |
25 | trainer:
26 | max_epochs: 10
27 |
28 | callbacks:
29 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint
30 | filename: "{epoch:04d}"
31 | save_last: True
32 | save_top_k: -1
33 |
34 | datamodule:
35 | _target_: aiaccel.torch.lightning.datamodules.SingleDataModule
36 |
37 | train_dataset_fn:
38 | _partial_: True
39 | _target_: torchvision.datasets.MNIST
40 |
41 | root: "./dataset"
42 | train: True
43 | download: True
44 |
45 | transform:
46 | _target_: torchvision.transforms.Compose
47 | transforms:
48 | - _target_: torchvision.transforms.Resize
49 | size: [256, 256]
50 | - _target_: torchvision.transforms.Grayscale
51 | num_output_channels: 3
52 | - _target_: torchvision.transforms.ToTensor
53 | - _target_: torchvision.transforms.Normalize
54 | mean: [0.5]
55 | std: [0.5]
56 |
57 | val_dataset_fn:
58 | _partial_: True
59 | _inherit_: ${datamodule.train_dataset_fn}
60 |
61 | train: False
62 |
63 | batch_size: 128
64 | wrap_scatter_dataset: False
65 |
66 | task:
67 | _target_: my_task.MyTask
68 | num_classes: 10
69 |
70 | model:
71 | _target_: torchvision.models.resnet50
72 | weights:
73 | _target_: hydra.utils.get_object
74 | path: torchvision.models.ResNet50_Weights.DEFAULT
75 |
76 | optimizer_config:
77 | _target_: aiaccel.torch.lightning.OptimizerConfig
78 | optimizer_generator:
79 | _partial_: True
80 | _target_: torch.optim.Adam
81 | lr: 1.e-4
82 |
83 | **********************
84 | Distributed Training
85 | **********************
86 |
87 | WIP...
88 |
89 | *****************
90 | Other Utilities
91 | *****************
92 |
93 | Other utilities are listed in :doc:`API Reference <../api_reference/torch>`.
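For example, a model trained with the app above can be restored from its working
directory. The following is a minimal sketch (it assumes the default layout
produced by training, i.e. a saved ``merged_config.yaml`` next to a
``checkpoints/last.ckpt``, and a hypothetical path):

.. code-block:: python

   from aiaccel.torch.lightning.ckpt import load_checkpoint

   # Loads merged_config.yaml, instantiates the task via its
   # load_from_checkpoint method, and returns the model with its config.
   model, config = load_checkpoint("path/to/working_directory", device="cpu")
   model.eval()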
94 | -------------------------------------------------------------------------------- /examples/torch/image_classification/recipes/resnet50.cifar10/config.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | _target_: lightning.Trainer 3 | default_root_dir: ${working_directory} 4 | 5 | max_epochs: 200 6 | precision: bf16-mixed 7 | 8 | devices: 1 9 | 10 | logger: 11 | _target_: lightning.pytorch.loggers.CSVLogger 12 | save_dir: ${working_directory} 13 | name: "" 14 | prefix: "" 15 | 16 | 17 | callbacks: 18 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 19 | save_top_k: 1 20 | monitor: "validation/accuracy" 21 | mode: "max" 22 | filename: "epoch={epoch:04d}_score={validation/accuracy:.2f}" 23 | 24 | - _target_: lightning.pytorch.callbacks.LearningRateMonitor 25 | logging_interval: "step" 26 | 27 | - _target_: lightning.pytorch.callbacks.RichProgressBar 28 | refresh_rate: 5 29 | - _target_: lightning.pytorch.callbacks.RichModelSummary 30 | max_depth: 3 31 | 32 | 33 | datamodule: 34 | _target_: aiaccel.torch.lightning.datamodules.SingleDataModule 35 | 36 | train_dataset_fn: 37 | _partial_: true 38 | _target_: torchvision.datasets.CIFAR10 39 | train: True 40 | transform: 41 | _target_: torchvision.transforms.Compose 42 | transforms: 43 | - _target_: torchvision.transforms.RandomCrop 44 | size: 32 45 | padding: 4 46 | - _target_: torchvision.transforms.RandomHorizontalFlip 47 | - _target_: torchvision.transforms.ToTensor 48 | - _target_: torchvision.transforms.Normalize 49 | mean: [0.4914, 0.4822, 0.4465] 50 | std: [0.2023, 0.1994, 0.2010] 51 | 52 | val_dataset_fn: 53 | _partial_: true 54 | _target_: torchvision.datasets.CIFAR10 55 | train: False 56 | transform: 57 | _target_: torchvision.transforms.Compose 58 | transforms: 59 | - _target_: torchvision.transforms.ToTensor 60 | - _target_: torchvision.transforms.Normalize 61 | mean: [0.4914, 0.4822, 0.4465] 62 | std: [0.2023, 0.1994, 0.2010] 63 | 64 | common_args: 65 | root: "./data" 66 | download: True 67 | 68 | use_scatter: False 69 | batch_size: 256 70 | num_workers: 24 71 | 72 | task: 73 | _target_: image_classification.task.ImageClassificationTask 74 | num_classes: 10 75 | 76 | model: 77 | _target_: image_classification.small_resnet50.SmallResNet50 78 | num_classes: 10 79 | 80 | optimizer_config: 81 | _target_: aiaccel.torch.lightning.OptimizerConfig 82 | optimizer_generator: 83 | _partial_: True 84 | _target_: torch.optim.Adam 85 | lr: 1.e-3 86 | scheduler_generator: 87 | _partial_: True 88 | _target_: torch.optim.lr_scheduler.CosineAnnealingLR 89 | T_max: 200 90 | 91 | scheduler_interval: epoch 92 | -------------------------------------------------------------------------------- /aiaccel/job/apps/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import cast 5 | 6 | from argparse import ArgumentParser, _SubParsersAction 7 | from importlib import resources 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import DictConfig 12 | 13 | from aiaccel.config import prepare_config, print_config, setup_omegaconf 14 | 15 | setup_omegaconf() 16 | 17 | 18 | def prepare_argument_parser( 19 | default_config_name: str, 20 | ) -> tuple[DictConfig, ArgumentParser, _SubParsersAction]: # type: ignore 21 | parser = ArgumentParser(add_help=False) 22 | parser.add_argument("--print_config", 
action="store_true")
23 | parser.add_argument("--config", type=Path, default=None)
24 | args, _ = parser.parse_known_args()
25 |
26 | args.config = Path(
27 | args.config
28 | or os.environ.get("AIACCEL_JOB_CONFIG")
29 | or (Path(str(resources.files(__package__) / "config")) / default_config_name)
30 | )  # type: ignore
31 |
32 | config = cast(DictConfig, prepare_config(args.config))
33 |
34 | if args.print_config:
35 | print_config(config)
36 |
37 | parser = ArgumentParser()
38 | parser.add_argument("--print_config", action="store_true")
39 | parser.add_argument("--config", type=Path)
40 | sub_parsers = parser.add_subparsers(dest="mode", required=True)
41 |
42 | parent_parser = ArgumentParser(add_help=False)
43 | parent_parser.add_argument("--walltime", type=str, default=config.walltime)
44 | parent_parser.add_argument("log_filename", type=Path)
45 | parent_parser.add_argument("command", nargs="+")
46 |
47 | sub_parser = sub_parsers.add_parser("cpu", parents=[parent_parser])
48 | sub_parser.add_argument("--n_tasks", type=int)
49 | sub_parser.add_argument("--n_tasks_per_proc", type=int, default=config["cpu-array"].n_tasks_per_proc)
50 | sub_parser.add_argument("--n_procs", type=int, default=config["cpu-array"].n_procs)
51 |
52 | sub_parser = sub_parsers.add_parser("gpu", parents=[parent_parser])
53 | sub_parser.add_argument("--n_tasks", type=int)
54 | sub_parser.add_argument("--n_tasks_per_proc", type=int, default=config["gpu-array"].n_tasks_per_proc)
55 | sub_parser.add_argument("--n_procs", type=int, default=config["gpu-array"].n_procs)
56 |
57 | sub_parser = sub_parsers.add_parser("mpi", parents=[parent_parser])
58 | sub_parser.add_argument("--n_procs", type=int, required=True)
59 | sub_parser.add_argument("--n_nodes", type=int, default=config["mpi"].n_nodes)
60 |
61 | sub_parser = sub_parsers.add_parser("train", parents=[parent_parser])
62 | sub_parser.add_argument("--n_gpus", type=int)
63 |
64 | return config, parser, sub_parsers
65 | -------------------------------------------------------------------------------- /docs/source/contribution_guide/coding_styles.md: -------------------------------------------------------------------------------- 1 | (coding-style)=
2 | # Coding styles
3 |
4 | ## Basic rules
5 |
6 | - Write source code for aiaccel in Python.
7 | - Coding style should follow PEP8.
8 | - Validate the coding style by using pycodestyle and flake8 in aiaccel.
9 | - See also Docstrings below.
10 | - Write type hints whenever possible, but there is no type hint validation in aiaccel.
11 | - When using a built-in such as `list` as a type hint, add a `from __future__ import annotations` import to support Python 3.8 in aiaccel.
12 | - Use [`numpy.random.RandomState`](https://numpy.org/doc/1.16/reference/generated/numpy.random.RandomState.html) to generate random values and maintain compatibility with [Optuna](https://github.com/optuna/optuna), which is used by aiaccel.
13 |
14 | ## Docstrings
15 |
16 | Basically, write docstrings in accordance with the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings). However, please note the following exceptions.
17 |
18 | - Docstrings for each module are not necessarily required.
19 | - In the `Args:` section, a parameter type is described in parentheses after the parameter name.
20 | - Add an `Example:` section as needed.
21 | - Include the docstring of the `__init__` method in the class docstring. Do not write it under `__init__`.
22 | - Use sphinx-style links to Python objects.
23 | - When using VSCode as an editor, [autoDocstring](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring) is useful for generating docstrings.
24 |
25 | ### Example
26 |
27 | ```python
28 | class ExampleClass:
29 | """Summary of class.
30 |
31 | There can be additional description(s) of this class.
32 |
33 | Args:
34 | param1 (type_of_param1): Description of `param1` which
35 | is given when __init__ method is called.
36 | param2 (type_of_param2): Description of `param2`.
37 |
38 | Attributes:
39 | param1 (type_of_param1): Description of `param1`.
40 | param2 (type_of_param2): Description of `param2`.
41 | param3 (type_of_param3): Description of `param3`.
42 | """
43 |
44 | def __init__(self, param1: type_of_param1, param2: type_of_param2):
45 | self.param1 = param1
46 | self.param2 = param2
47 | self.param3 = generate_param3()
48 |
49 | def method(self, arg1: type_of_arg1) -> type_of_return:
50 | """Receives `type_of_arg1` object and returns return_of_method.
51 |
52 | Args:
53 | arg1 (type_of_arg1): Description of `arg1`.
54 |
55 | Returns:
56 | type_of_return: Description of return value. If this method
57 | returns nothing, this section can be omitted.
58 |
59 | Raises:
60 | TypeOfException: Description of Exception.
61 |
62 | """
63 | ...
64 | return return_of_method
65 |
66 | ```
67 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/ckpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any
5 |
6 | import logging
7 | from pathlib import Path
8 |
9 | from hydra.utils import instantiate
10 | from omegaconf import DictConfig, ListConfig
11 |
12 | from torch import nn
13 |
14 | from huggingface_hub import snapshot_download
15 |
16 | from aiaccel.config import prepare_config
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def load_checkpoint(
22 | model_path: str | Path,
23 | config_name: str = "merged_config.yaml",
24 | device: str = "cuda",
25 | overwrite_config: DictConfig | ListConfig | dict[Any, Any] | list[Any] | None = None,
26 | ) -> tuple[nn.Module, DictConfig | ListConfig]:
27 | """
28 | Load a PyTorch Lightning model from a pre-trained checkpoint.
29 |
30 | This function loads a model from a specified path, which can be a local directory
31 | or a Hugging Face repository. It also loads the associated configuration file and
32 | allows for optional configuration overrides.
33 |
34 |
35 | Args:
36 | model_path (str | Path): The path to the model directory or Hugging Face repo.
37 | For local paths, use the format "file://" or just the path (str | Path).
38 | For Hugging Face, use the format "hf://".
39 | config_name (str): The name of the configuration file to load. Default is "merged_config.yaml".
40 | device (str): The device to map the model to. Default is "cuda".
41 | overwrite_config (DictConfig | ListConfig | dict | list | None): Optional configuration overrides.
42 | """
43 |
44 | if isinstance(model_path, str):
45 | if model_path.startswith("hf://"):
46 | logger.info("Downloading model from Hugging Face...")
47 | hf_path = model_path.removeprefix("hf://")
48 | model_path = Path(snapshot_download(hf_path))
49 | elif model_path.startswith("file://"):
50 | model_path = Path(model_path.removeprefix("file://"))
51 | else:
52 | model_path = Path(model_path)
53 |
54 | config_path = model_path / config_name
55 | config = prepare_config(config_path, overwrite_config=overwrite_config)
56 |
57 | checkpoint_filename = config.checkpoint_filename if "checkpoint_filename" in config else "last.ckpt"
58 | checkpoint_path = model_path / "checkpoints" / checkpoint_filename
59 |
60 | logger.info(f"Loading model from {checkpoint_path}...")
61 |
62 | config.task._target_ += ".load_from_checkpoint"
63 | model = instantiate(
64 | config.task,
65 | checkpoint_path=checkpoint_path,
66 | map_location=device,
67 | )
68 |
69 | return model, config
70 | -------------------------------------------------------------------------------- /docs/source/contribution_guide/pull_requests.md: -------------------------------------------------------------------------------- 1 | # Pull Requests
2 |
3 | When you want your modified code to be reflected in the repository, please submit a pull request.
4 |
5 | ## Procedures
6 |
7 | - Please fork the aiaccel repository on GitHub.
8 | - After forking, run the `git clone` command for the forked repository.
9 |
10 | ~~~bash
11 | git clone https://github.com/[YOUR USERNAME]/aiaccel.git
12 | ~~~
13 |
14 | ## Developments
15 | - Update your local repository to the latest version.
16 |
17 | ~~~bash
18 | git checkout main
19 | git pull upstream main
20 | ~~~
21 |
22 | - Create a branch.
23 |
24 | ~~~bash
25 | git checkout -b feature/add-new-feature
26 | ~~~
27 |
28 | - Commit locally using the `git add` and `git commit` commands as you progress.
29 |
30 | - The commit message describes the motivation for the change, the nature of the bug, or details of the enhancement.
31 | - The message should be written in such a way that its contents can be understood without referring to the code.
32 |
33 |
34 | ## Submitting
35 |
36 | Before submitting a pull request, confirm the following:
37 | - Did you discuss it with other developers on issues in advance?
38 | - Can it be distributed under the MIT license?
39 | - Are there appropriate [unit tests](#test)?
40 | - Can the [unit tests](#test) be run locally?
41 | - Do public functions have docstrings?
42 | - Can the [documentation](#documentation-wip) be rendered correctly?
43 | - Is the [coding style](#coding-style) appropriate?
44 | - Is the commit message appropriate?
45 | - For larger commits, please provide an example (docs/source/examples) and a module-level description.
46 | - If you are adding compiled code, have you modified setup.py?
47 |
48 | After confirming the above, do the following:
49 | - Push changes to the fork on GitHub.
50 |
51 | ~~~bash
52 | git push origin feature/add-new-feature
53 | ~~~
54 |
55 | - Enter your GitHub username and password.
56 | - Move to the GitHub web page and write the title and message, noting the following.
57 | - Title
58 | - Briefly describe the changes.
59 | - Code should be enclosed in backquotes.
60 | - Do not end with a period.
61 | - Descriptions
62 | - Write the motivation.
63 | - Write the changes.
64 | - If the related issues can be closed, please close them with `Close #N`.
65 | - If the work is in progress, write the remaining tasks.
66 | - Submit the pull request.
67 |
68 | ## Review processes
69 |
70 | - Other developers can contribute comments to improve implementations, documents, and coding styles in the pull request.
71 | - When updating code in the pull request, please commit the changes to your local repository and push them to the fork only after they have been successfully tested in your local environment.
72 | - If the pull request has been reviewed and approved by at least one member of the aiaccel development team, it will be merged into the main branch.
73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | # Configuration file for the Sphinx documentation builder.
5 | # Full options: https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | from pathlib import Path
8 | import sys
9 |
10 | import aiaccel
11 |
12 | # -- Path setup --------------------------------------------------------------
13 | root_path = Path(__file__).parent.parent.parent
14 | sys.path.insert(0, str(root_path.absolute()))
15 |
16 | # -- Project information -----------------------------------------------------
17 | project = "aiaccel"
18 | author = "National Institute of Advanced Industrial Science And Technology (AIST)"
19 | project_copyright = author
20 | release = aiaccel.__version__
21 |
22 | html_logo = f"{root_path}/docs/image/logo_aiaccel.png"
23 | html_favicon = f"{root_path}/docs/image/favicon.ico"
24 |
25 | # -- General configuration ---------------------------------------------------
26 | extensions = [
27 | "sphinx.ext.autosummary",
28 | "sphinx.ext.autodoc",
29 | "sphinx.ext.githubpages",
30 | "sphinx.ext.napoleon",
31 | "sphinx.ext.todo",
32 | "sphinx.ext.viewcode",
33 | "sphinx.ext.doctest",
34 | "sphinx_design",
35 | "sphinx_copybutton",
36 | "myst_parser",
37 | ]
38 |
39 | templates_path = ["_templates"]
40 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
41 |
42 | source_suffix = {
43 | ".rst": "restructuredtext",
44 | ".txt": "markdown",
45 | ".md": "markdown",
46 | }
47 |
48 | language = "en"
49 |
50 | # -- HTML output -------------------------------------------------------------
51 | html_theme = "shibuya"
52 | html_show_sourcelink = False
53 | html_show_sphinx = False
54 |
55 | html_static_path = ["_static"]
56 | html_theme_options = {
57 | "accent_color": "gold",
58 | "nav_links": [
59 | {"title": "User Guide", "url": "user_guide/index"},
60 | {
61 | "title": "API Reference",
62 | "url": "api_reference/index",
63 | "children": [
64 | {
65 | "title": "OmegaConf Utilities",
66 | "url": "api_reference/config",
67 | "summary": "aiaccel.config",
68 | },
69 | {
70 | "title": "PyTorch/Lightning Toolkit",
71 | "url": "api_reference/torch",
72 | "summary": "aiaccel.torch",
73 | },
74 | {
75 | "title": "Hyperparameter Optimization",
76 | "url": "api_reference/hpo",
77 | "summary": "aiaccel.hpo",
78 | },
79 | ],
80 | },
81 | {"title": "Contribution Guide", "url": "contribution_guide/index"},
82 | ],
83 | "github_url": "https://github.com/aistairc/aiaccel",
84 | "globaltoc_expand_depth": 1,
85 | }
86 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload aiaccel package to PyPI
2 |
3 | on:
4 | schedule:
5 | - cron: '0 0 1 * *' 6 | workflow_dispatch: 7 | 8 | concurrency: 9 | group: pypi-publish 10 | cancel-in-progress: false 11 | 12 | jobs: 13 | check_commits: 14 | runs-on: ubuntu-latest 15 | outputs: 16 | has_changes: ${{ steps.check_commits.outputs.has_changes }} 17 | version: ${{ steps.get_version.outputs.version }} 18 | tag: ${{ steps.get_version.outputs.tag }} 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | fetch-tags: true 25 | 26 | - name: Check for commits since last tag 27 | id: check_commits 28 | run: | 29 | LAST_TAG=$(git tag --list 'v*' --sort=-v:refname | head -n 1) 30 | echo "Last tag: ${LAST_TAG:-}" 31 | if [ -z "$LAST_TAG" ] || git log "${LAST_TAG}.." --pretty=oneline | grep .; then 32 | echo "has_changes=true" >> $GITHUB_OUTPUT 33 | else 34 | echo "has_changes=false" >> $GITHUB_OUTPUT 35 | fi 36 | 37 | - name: Get current [year].[month](.[patch]) version 38 | id: get_version 39 | run: | 40 | BASE=$(date '+%Y.%-m') 41 | LAST=$(git tag -l "v${BASE}*" --sort=-v:refname | head -n1) 42 | 43 | case "$LAST" in 44 | "") VERSION=$BASE.0 ;; 45 | *) VERSION=$BASE.$(( ${LAST##*.}+1 )) ;; 46 | esac 47 | 48 | echo "version=$VERSION" >>"$GITHUB_OUTPUT" 49 | echo "tag=v$VERSION" >>"$GITHUB_OUTPUT" 50 | 51 | publish-if-needed: 52 | needs: check_commits 53 | if: needs.check_commits.outputs.has_changes == 'true' 54 | runs-on: ubuntu-latest 55 | env: 56 | PIP_INDEX_URL: https://download.pytorch.org/whl/cpu 57 | PIP_EXTRA_INDEX_URL: https://pypi.org/simple 58 | permissions: 59 | contents: write 60 | id-token: write 61 | 62 | steps: 63 | - uses: actions/checkout@v4 64 | with: 65 | fetch-depth: 0 66 | fetch-tags: true 67 | 68 | - name: Set up Python 69 | uses: actions/setup-python@v5 70 | with: 71 | cache: 'pip' 72 | cache-dependency-path: pyproject.toml 73 | 74 | - name: Install dependencies 75 | run: | 76 | pip install -e .[dev,github-actions] 77 | 78 | - name: Update and push tag 79 | run: | 80 | git config user.name "github-actions[bot]" 81 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 82 | git tag ${{ needs.check_commits.outputs.tag }} 83 | git push origin ${{ needs.check_commits.outputs.tag }} 84 | 85 | - name: Build package 86 | run: | 87 | hatch build 88 | 89 | - name: Publish to PyPI 90 | uses: pypa/gh-action-pypi-publish@release/v1 91 | 92 | - name: Create GitHub Release 93 | env: 94 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 95 | run: | 96 | gh release create ${{ needs.check_commits.outputs.tag }} \ 97 | --title ${{ needs.check_commits.outputs.tag }} \ 98 | --notes "Release created automatically by GitHub Actions" 99 | -------------------------------------------------------------------------------- /aiaccel/job/apps/slurm.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | 4 | import os 5 | from pathlib import Path 6 | import shlex 7 | import subprocess 8 | import time 9 | 10 | from aiaccel.job.apps import prepare_argument_parser 11 | 12 | 13 | def main() -> None: 14 | # Load configuration (from the default YAML string) 15 | config, parser, sub_parsers = prepare_argument_parser("slurm.yaml") 16 | 17 | args = parser.parse_args() 18 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode 19 | 20 | # Prepare the job script and arguments 21 | job = config[mode].job.format(command=shlex.join(args.command), args=args) 22 | 23 | if mode in ["cpu-array", "gpu-array"]: 24 | job = f"""\ 25 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do 26 | TASK_INDEX=$(( SLURM_ARRAY_TASK_ID + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) )) 27 | 28 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then 29 | break 30 | fi 31 | 32 | TASK_INDEX=$TASK_INDEX \\ 33 | TASK_STEPSIZE={args.n_tasks_per_proc} \\ 34 | {job} > {args.log_filename.with_suffix("")}.${{SLURM_ARRAY_TASK_ID}}-${{LOCAL_PROC_INDEX}}.log 2>&1 & 35 | 36 | pids[$LOCAL_PROC_INDEX]=$! 37 | done 38 | 39 | for i in "${{!pids[@]}}"; do 40 | wait ${{pids[$i]}} 41 | done 42 | """ 43 | job_log_filename = args.log_filename.with_suffix(".%a.log") 44 | job_status_filename: Path = args.log_filename.with_suffix(".${SLURM_ARRAY_TASK_ID}.out") 45 | 46 | status_filename_list = [] 47 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs): 48 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out")) 49 | else: 50 | job_log_filename = args.log_filename.resolve() 51 | job_status_filename = args.log_filename.with_suffix(".out").resolve() 52 | 53 | status_filename_list = [job_status_filename] 54 | 55 | job_script = f"""\ 56 | #! /bin/bash 57 | #SBATCH -o {job_log_filename} 58 | #SBATCH -t {args.walltime} 59 | 60 | set -eE -o pipefail 61 | trap 'echo $? 
> {job_status_filename}' ERR EXIT  # at error and exit
62 |
63 |
64 | {config.script_prologue}
65 |
66 | {job}
67 | """
68 |
69 | sbatch = config.sbatch.format(args=args)
70 | sbatch_args = config[mode].sbatch_args.format(args=args)
71 |
72 | # Create the job script file, remove old status files, and run the job
73 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
74 |
75 | job_filename: Path = args.log_filename.with_suffix(".sh")
76 | with open(job_filename, "w") as f:
77 | f.write(job_script)
78 |
79 | for status_filename in status_filename_list:
80 | status_filename.unlink(missing_ok=True)
81 |
82 | subprocess.run(f"{sbatch} {sbatch_args} {job_filename}", shell=True, check=True)
83 |
84 | for status_filename in status_filename_list:
85 | while not status_filename.exists():
86 | time.sleep(1.0)
87 |
88 | if config.get("use_scandir", False):  # Refresh the file system if needed
89 | os.scandir(status_filename.parent)
90 |
91 | status = int(status_filename.read_text())
92 | if status != 0:
93 | raise RuntimeError(f"Job failed with {status} exit code.")
94 | status_filename.unlink()
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system]
2 | requires = ["hatchling", "hatch-vcs"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "aiaccel"
7 | description = "AIST Toolkit for Accelerating Machine Learning Research"
8 | readme = "README.md"
9 | requires-python = ">=3.10"
10 | license = {text = "MIT"}
11 | authors = [
12 | {name = "AIST", email = "onishi-masaki@aist.go.jp"}
13 | ]
14 | classifiers = [
15 | "License :: OSI Approved :: MIT License",
16 | ]
17 | dependencies = [
18 | "attrs",
19 | "numpy",
20 | "scipy",
21 | "optuna>=4.5.0",
22 | "omegaconf",
23 | "hydra-core",
24 | "huggingface-hub",
25 | "colorama",
26 | "lightning>=2.2.1",
27 | "torch>=2.2.0",
28 | "h5py",
29 | "rich",
30 | "tensorboard",
31 | "typing_extensions",
32 | "simpleeval"
33 | ]
34 | dynamic = ["version"]
35 |
36 | [tool.hatch.version]
37 | source = "vcs"
38 |
39 | [project.urls]
40 | documentation = "https://aistairc.github.io/aiaccel/"
41 | repository = "https://github.com/aistairc/aiaccel"
42 |
43 | [project.optional-dependencies]
44 | dev = [
45 | "mypy",
46 | "myst-parser",
47 | "pre-commit",
48 | "pytest",
49 | "pytest-cov",
50 | "pytest-mock",
51 | "pytest-subprocess",
52 | "pytest-xdist",
53 | "ruff",
54 | "sphinx",
55 | "sphinxcontrib-jquery",
56 | "sphinx-autobuild",
57 | "sphinx-intl",
58 | "sphinx-fontawesome",
59 | "sphinx-copybutton",
60 | "shibuya",
61 | "sphinx_design",
62 | "types-colorama",
63 | "types-PyYAML",
64 | "undecorated",
65 | "pandas",
66 | "pandas-stubs",
67 | "matplotlib",
68 | "docstrfmt",
69 | "types-toml",
70 | "hatch",
71 | ]
72 | github-actions = [
73 | "pytest-github-actions-annotate-failures",
74 | ]
75 |
76 | [project.scripts]
77 | aiaccel-job = "aiaccel.launcher:main"
78 | aiaccel-config = "aiaccel.launcher:main"
79 | aiaccel-torch = "aiaccel.launcher:main"
80 | aiaccel-hpo = "aiaccel.launcher:main"
81 |
82 |
83 | # ruff configurations
84 | [tool.ruff]
85 | line-length = 120
86 | target-version = "py310"
87 | fix = true
88 |
89 | [tool.ruff.lint]
90 | select = ["F", "E", "W", "UP", "B", "SIM", "I", "C", "A", "ERA", "N", "C90"]
91 |
92 | [tool.ruff.lint.per-file-ignores]
93 | "__init__.py" = ["F401"]
94 |
95 | [tool.ruff.lint.isort]
96 |
force-sort-within-sections = true 97 | 98 | section-order = [ 99 | "future", 100 | "typing", 101 | "standard-library", 102 | "utilities", 103 | "datascience", 104 | "torch", 105 | "torch-third-party", 106 | "third-party", 107 | "audio", 108 | "first-party", 109 | "local-folder" 110 | ] 111 | 112 | [tool.ruff.lint.isort.sections] 113 | "typing" = ["typing", "typing_extensions", "numpy.typing"] 114 | "utilities" = ["colorama", "hydra", "omegaconf", "progressbar", "rich"] 115 | "datascience" = ["numpy", "scipy", "pandas", "matplotlib", "opt_einsum", "einops"] 116 | "torch" = ["torch"] 117 | "torch-third-party" = ["torchaudio", "torchvision", "auraloss", "lightning", "einops.layers"] 118 | "audio" = ["librosa", "pypesq", "pystoi", "soundfile"] 119 | 120 | 121 | # pixi configurations 122 | [tool.pixi.workspace] 123 | channels = ["conda-forge"] 124 | platforms = ["linux-64"] 125 | 126 | [tool.pixi.dependencies] 127 | python = ">=3.10,<3.11" 128 | 129 | [tool.pixi.pypi-dependencies] 130 | aiaccel = { path = ".", editable = true } 131 | 132 | [tool.pixi.environments] 133 | default = { features = ["dev"] } 134 | 135 | [tool.pixi.tasks] 136 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/hdf5_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | import json 9 | from pathlib import Path 10 | import pickle as pkl 11 | 12 | import torch 13 | from torch.utils.data import Dataset 14 | 15 | import h5py as h5 16 | 17 | __all__ = [ 18 | "RawHDF5Dataset", 19 | "HDF5Dataset", 20 | ] 21 | 22 | 23 | class RawHDF5Dataset(Dataset[dict[str, Any]]): 24 | """ 25 | A dataset class for reading data from HDF5 files. 26 | 27 | Args: 28 | dataset_path (Union[Path, str]): The path to the HDF5 dataset file. 29 | grp_list (Union[Path, str, List[str], None], optional): The list of groups to load from the dataset. 30 | If None, all groups in the dataset will be loaded. If a string or Path, it should be the path to a file 31 | containing the list of groups. If a list, it should directly specify the groups to load. Defaults to None. 32 | 33 | Raises: 34 | NotImplementedError: If grp_list is of an unsupported type. 35 | 36 | Attributes: 37 | dataset_path (Union[Path, str]): The path to the HDF5 dataset file. 38 | grp_list (List[str]): The list of groups to load from the dataset. 39 | f (Optional[h5.File]): The HDF5 file object used for reading the dataset. 
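Example:
    A minimal sketch (assuming a hypothetical ``dataset.hdf5`` that stores one
    HDF5 group per sample, each containing named arrays, as this class
    expects)::

        dataset = RawHDF5Dataset("dataset.hdf5")
        sample = dataset[0]  # dict mapping array names to numpy arrays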
40 |
41 | """
42 |
43 | def __init__(self, dataset_path: Path | str, grp_list: Path | str | list[str] | None = None) -> None:
44 | self.dataset_path = dataset_path
45 |
46 | if grp_list is None:
47 | with h5.File(self.dataset_path, "r") as f:
48 | self.grp_list = list(f.keys())
49 | elif isinstance(grp_list, (str | Path)):
50 | grp_list = Path(grp_list)
51 | if grp_list.suffix == ".pkl":
52 | with open(grp_list, "rb") as f:
53 | self.grp_list = pkl.load(f)
54 | elif grp_list.suffix == ".json":
55 | with open(grp_list) as f:
56 | self.grp_list = json.load(f)
57 | elif isinstance(grp_list, list):
58 | self.grp_list = grp_list
59 | else:
60 | raise NotImplementedError()
61 | self.grp_list.sort()
62 |
63 | self.f: h5.File | None = None
64 |
65 | def __len__(self) -> int:
66 | return len(self.grp_list)
67 |
68 | def __getitem__(self, index: int) -> dict[str, Any]:
69 | if self.f is None:
70 | self.f = h5.File(self.dataset_path, "r")
71 |
72 | return {k: v[:] for k, v in self.f[self.grp_list[index]].items()}  # type: ignore
73 |
74 | def __del__(self) -> None:
75 | if self.f is not None:
76 | self.f.close()
77 |
78 |
79 | class HDF5Dataset(RawHDF5Dataset):
80 | """
81 | A dataset class for loading data from an HDF5 file.
82 |
83 | This class extends the `RawHDF5Dataset` class and provides a convenient way to load data from an HDF5 file
84 | and convert it into a dictionary of torch tensors.
85 |
86 | Args:
87 | dataset_path (Path | str): The path to the HDF5 dataset file.
88 | grp_list (Path | str | list[str] | None, optional): The list of groups to load from the dataset.
89 | See `RawHDF5Dataset` for details. Defaults to None.
90 |
91 | Returns:
92 | dict[str, torch.Tensor]: A dictionary containing the data loaded from the HDF5 file, where the keys are
93 | the names of the data fields and the values are torch tensors.
94 | """
95 |
96 | def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
97 | return {k: torch.as_tensor(v) for k, v in super().__getitem__(index).items()}
98 | -------------------------------------------------------------------------------- /aiaccel/job/apps/sge.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python
2 |
3 |
4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
5 | # SPDX-License-Identifier: MIT
6 |
7 | import os
8 | from pathlib import Path
9 | import shlex
10 | import subprocess
11 | import time
12 |
13 | from aiaccel.job.apps import prepare_argument_parser
14 |
15 |
16 | def main() -> None:
17 | # Load configuration (from the default YAML string)
18 | config, parser, sub_parsers = prepare_argument_parser("sge.yaml")
19 |
20 | args = parser.parse_args()
21 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode
22 |
23 | # Prepare the job script and arguments
24 | job = config[mode].job.format(command=shlex.join(args.command), args=args)
25 |
26 | if mode in ["cpu-array", "gpu-array"]:
27 | job = f"""\
28 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do
29 | TASK_INDEX=$(( SGE_TASK_ID + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) ))
30 |
31 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then
32 | break
33 | fi
34 |
35 | TASK_INDEX=$TASK_INDEX \\
36 | TASK_STEPSIZE={args.n_tasks_per_proc} \\
37 | {job} > {args.log_filename.with_suffix("")}.${{SGE_TASK_ID}}-${{LOCAL_PROC_INDEX}}.log 2>&1 &
38 |
39 | pids[$LOCAL_PROC_INDEX]=$!
40 | done
41 |
42 | for i in "${{!pids[@]}}"; do
43 | wait ${{pids[$i]}}
44 | done
45 | """
46 | job_log_filename = args.log_filename.with_suffix(".$TASK_ID.log")
47 | job_status_filename: Path = args.log_filename.with_suffix(".${SGE_TASK_ID}.out")
48 |
49 | status_filename_list = []
50 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs):
51 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out"))
52 | else:
53 | job_log_filename = args.log_filename
54 | job_status_filename = args.log_filename.with_suffix(".out")
55 |
56 | status_filename_list = [job_status_filename]
57 |
58 | job_script = f"""\
59 | #! /bin/bash
60 |
61 | #$-j y
62 | #$-cwd
63 | #$-o {job_log_filename}
64 |
65 | set -eE -o pipefail
66 | trap 'echo $? > {job_status_filename}' ERR EXIT  # at error and exit
67 | trap 'echo 143 > {job_status_filename}' TERM  # at termination (by job scheduler)
68 |
69 | if [ -n "$PBS_O_WORKDIR" ] && [ "$PBS_ENVIRONMENT" != "PBS_INTERACTIVE" ]; then
70 | cd $PBS_O_WORKDIR
71 | fi
72 |
73 |
74 | {config.script_prologue}
75 |
76 | {job}
77 | """
78 |
79 | qsub = config.qsub.format(args=args)
80 | qsub_args = config[mode].qsub_args.format(args=args)
81 |
82 | # Create the job script file, remove old status files, and run the job
83 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
84 |
85 | job_filename: Path = args.log_filename.with_suffix(".sh")
86 | with open(job_filename, "w") as f:
87 | f.write(job_script)
88 |
89 | for status_filename in status_filename_list:
90 | status_filename.unlink(missing_ok=True)
91 |
92 | subprocess.run(f"{qsub} {qsub_args} {job_filename}", shell=True, check=True)
93 |
94 | for status_filename in status_filename_list:
95 | while not status_filename.exists():
96 | time.sleep(1.0)
97 | if config.get("use_scandir", False):  # Refresh the file system if needed
98 | os.scandir(status_filename.parent)
99 |
100 | status = int(status_filename.read_text())
101 | if status != 0:
102 | raise RuntimeError(f"Job failed with {status} exit code.")
103 | status_filename.unlink()
104 |
105 |
106 | if __name__ == "__main__":
107 | main()
108 | -------------------------------------------------------------------------------- /aiaccel/job/apps/pbs.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | 4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 5 | # SPDX-License-Identifier: MIT 6 | 7 | import os 8 | from pathlib import Path 9 | import shlex 10 | import subprocess 11 | import time 12 | 13 | from aiaccel.job.apps import prepare_argument_parser 14 | 15 | 16 | def main() -> None: 17 | # Load configuration (from the default YAML string) 18 | config, parser, sub_parsers = prepare_argument_parser("pbs.yaml") 19 | 20 | args = parser.parse_args() 21 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode 22 | 23 | # Prepare the job script and arguments 24 | job = config[mode].job.format(command=shlex.join(args.command), args=args) 25 | 26 | if mode in ["cpu-array", "gpu-array"]: 27 | job = f"""\ 28 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do 29 | TASK_INDEX=$(( PBS_ARRAY_INDEX + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) )) 30 | 31 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then 32 | break 33 | fi 34 | 35 | TASK_INDEX=$TASK_INDEX \\ 36 | TASK_STEPSIZE={args.n_tasks_per_proc} \\ 37 | {job} > {args.log_filename.with_suffix("")}.${{PBS_ARRAY_INDEX}}-${{LOCAL_PROC_INDEX}}.log 2>&1 & 38 | 39 | pids[$LOCAL_PROC_INDEX]=$! 40 | done 41 | 42 | for i in "${{!pids[@]}}"; do 43 | wait ${{pids[$i]}} 44 | done 45 | """ 46 | job_log_filename = args.log_filename.with_suffix(".^array_index^.log") 47 | job_status_filename: Path = args.log_filename.with_suffix(".${PBS_ARRAY_INDEX}.out") 48 | 49 | status_filename_list = [] 50 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs): 51 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out")) 52 | else: 53 | job_log_filename = args.log_filename 54 | job_status_filename = args.log_filename.with_suffix(".out") 55 | 56 | status_filename_list = [job_status_filename] 57 | 58 | job_script = f"""\ 59 | #! /bin/bash 60 | 61 | #PBS -j oe 62 | #PBS -k oed 63 | #PBS -o {job_log_filename} 64 | 65 | set -eE -o pipefail 66 | trap 'echo $? 
> {job_status_filename}' ERR EXIT  # at error and exit
67 | trap 'echo 143 > {job_status_filename}' TERM  # at termination (by job scheduler)
68 |
69 | if [ -n "$PBS_O_WORKDIR" ] && [ "$PBS_ENVIRONMENT" != "PBS_INTERACTIVE" ]; then
70 | cd $PBS_O_WORKDIR
71 | fi
72 |
73 |
74 | {config.script_prologue}
75 |
76 | {job}
77 | """
78 |
79 | qsub = config.qsub.format(args=args)
80 | qsub_args = config[mode].qsub_args.format(args=args)
81 |
82 | # Create the job script file, remove old status files, and run the job
83 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
84 |
85 | job_filename: Path = args.log_filename.with_suffix(".sh")
86 | with open(job_filename, "w") as f:
87 | f.write(job_script)
88 |
89 | for status_filename in status_filename_list:
90 | status_filename.unlink(missing_ok=True)
91 |
92 | subprocess.run(f"{qsub} {qsub_args} {job_filename}", shell=True, check=True)
93 |
94 | for status_filename in status_filename_list:
95 | while not status_filename.exists():
96 | time.sleep(1.0)
97 |
98 | if config.get("use_scandir", False):  # Refresh the file system if needed
99 | os.scandir(status_filename.parent)
100 |
101 | status = int(status_filename.read_text())
102 | if status != 0:
103 | raise RuntimeError(f"Job failed with {status} exit code.")
104 | status_filename.unlink()
105 |
106 |
107 | if __name__ == "__main__":
108 | main()
109 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/cached_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any, TypeVar
5 |
6 | from multiprocessing import Manager
7 |
8 | import torch
9 | from torch.utils.data import Dataset
10 |
11 | __all__ = ["CachedDataset"]
12 |
13 |
14 | class NumpiedTensor:
15 | """
16 | A wrapper class that converts a PyTorch tensor to a NumPy array and vice versa.
17 |
18 | Args:
19 | tensor (torch.Tensor): The input PyTorch tensor.
20 |
21 | Attributes:
22 | array (np.ndarray): The NumPy array representation of the tensor.
23 |
24 | Methods:
25 | to_tensor: Converts the NumPy array back to a PyTorch tensor.
26 | """
27 |
28 | def __init__(self, tensor: torch.Tensor) -> None:
29 | self.array = tensor.numpy()
30 |
31 | def to_tensor(self) -> torch.Tensor:
32 | """
33 | Converts the NumPy array back to a PyTorch tensor.
34 |
35 | Returns:
36 | torch.Tensor: The PyTorch tensor representation of the NumPy array.
37 | """
38 | return torch.tensor(self.array)
39 |
40 |
41 | def numpize_sample(sample: Any) -> Any:
42 | """
43 | Converts the input sample to a NumPy-compatible format.
44 |
45 | Args:
46 | sample (Any): The input sample to be converted.
47 |
48 | Returns:
49 | Any: The converted sample in a NumPy-compatible format.
50 | """
51 |
52 | if isinstance(sample, torch.Tensor):
53 | return NumpiedTensor(sample)
54 | elif isinstance(sample, tuple):
55 | return tuple(numpize_sample(s) for s in sample)
56 | elif isinstance(sample, list):
57 | return [numpize_sample(s) for s in sample]
58 | elif isinstance(sample, dict):
59 | return {k: numpize_sample(v) for k, v in sample.items()}
60 | else:
61 | return sample
62 |
63 |
64 | def tensorize_sample(sample: Any) -> Any:
65 | """
66 | Converts the given sample into a tensor representation.
67 |
68 | Args:
69 | sample (Any): The input sample to be tensorized.
70 | 71 | Returns: 72 | Any: The tensorized representation of the input sample. 73 | """ 74 | 75 | if isinstance(sample, NumpiedTensor): 76 | return sample.to_tensor() 77 | elif isinstance(sample, tuple): 78 | return tuple(tensorize_sample(s) for s in sample) 79 | elif isinstance(sample, list): 80 | return [tensorize_sample(s) for s in sample] 81 | elif isinstance(sample, dict): 82 | return {k: tensorize_sample(v) for k, v in sample.items()} 83 | else: 84 | return sample 85 | 86 | 87 | T_co = TypeVar("T_co", covariant=True) 88 | 89 | 90 | class CachedDataset(Dataset[T_co]): 91 | """ 92 | A dataset wrapper that caches the samples to improve performance. 93 | 94 | Args: 95 | dataset (Dataset): The original dataset to be wrapped. 96 | 97 | Attributes: 98 | dataset (Dataset): The original dataset. 99 | manager (Manager): The multiprocessing manager. 100 | cache (dict): The cache dictionary to store the cached samples. 101 | """ 102 | 103 | def __init__(self, dataset: Dataset[T_co]) -> None: 104 | self.dataset = dataset 105 | 106 | self.manager = Manager() 107 | self.cache = self.manager.dict() 108 | 109 | def __len__(self) -> int: 110 | return len(self.dataset) # type: ignore[arg-type] 111 | 112 | def __getitem__(self, index: int) -> Any: 113 | if index not in self.cache: 114 | self.cache[index] = numpize_sample(self.dataset[index]) 115 | 116 | return tensorize_sample(self.cache[index]) 117 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/plot.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import glob 5 | 6 | from matplotlib.axes._axes import Axes 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | def get_min_values(values: list[float]) -> list[float]: 13 | min_values = [] 14 | min_value = np.inf 15 | for value in values: 16 | if value < min_value: 17 | min_value = value 18 | min_values.append(min_value) 19 | 20 | return min_values 21 | 22 | 23 | def plot_dim_vs_min_value(ax: Axes, csv_names_for_dim: list[list[list[str]]], title: str) -> None: 24 | label_names = ["nm", "TPE-mcT-mvF", "nm+subTPE-mcT-mvF"] 25 | colors = ["r", "g", "b"] 26 | 27 | dim = ["2", "3", "5", "10", "20", "40"] 28 | 29 | # dim 30 | min_values_for_dim = [] 31 | for csv_names_for_algorism in csv_names_for_dim: 32 | # nm, tpe, nm_subtpe 33 | min_values_for_algorism = [] 34 | for csv_names in csv_names_for_algorism: 35 | min_values = [] 36 | for csv_name in csv_names: 37 | df = pd.read_csv(csv_name) 38 | 39 | df_values = df["value - f_opt"] 40 | print(df_values) 41 | print(min(df_values)) 42 | min_values.append(min(df_values)) 43 | min_values_for_algorism.append(min_values) 44 | min_values_for_dim.append(min_values_for_algorism) 45 | 46 | print(len(min_values_for_dim)) 47 | 48 | for i in range(3): 49 | values = [item[i] for item in min_values_for_dim] 50 | print(values) 51 | values_mean = np.array(values).mean(axis=1) 52 | values_std = np.array(values).std(axis=1) 53 | ax.errorbar( 54 | dim, 55 | values_mean, 56 | yerr=values_std, 57 | capsize=5, 58 | markersize=10, 59 | ecolor=colors[i], 60 | markeredgecolor=colors[i], 61 | color=colors[i], 62 | label=label_names[i], 63 | ) 64 | 65 | if title == "f6": 66 | ax.set_ylim(-1000, 10000) 67 | ax.set_title(title) 68 | ax.grid(axis="both") 69 | ax.legend(fontsize=6) 70 | 71 | 72 | def 
72 | def compare_optimizer(base_dir: str = ".") -> None:
73 |     fig, ax = plt.subplots(5, 5, figsize=(16, 20))
74 |     if isinstance(ax, Axes):  # type narrowing only: a 5x5 grid always yields an array of Axes
75 |         return
76 | 
77 |     for num_of_f in range(1, 25):
78 |         result_csv_list_for_dim = []
79 |         for num_of_dm in [2, 3, 5, 10, 20, 40]:
80 |             result_csv_patterns = [
81 |                 f"{base_dir}/nelder-mead/optuna_csv/optuna-nelder-mead-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
82 |                 f"{base_dir}/TPE/optuna_csv/optuna-TPE-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
83 |                 f"{base_dir}/nelder-mead-subTPE/optuna_csv/optuna-nelder-mead-subTPE-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
84 |             ]
85 | 
86 |             result_csv_list = [sorted(glob.glob(pattern)) for pattern in result_csv_patterns]
87 |             print(result_csv_list)
88 |             result_csv_list_for_dim.append(result_csv_list)
89 | 
90 |         plot_dim_vs_min_value(
91 |             ax[(num_of_f - 1) // 5, (num_of_f - 1) % 5], result_csv_list_for_dim, f"f{num_of_f}"
92 |         )
93 | 
94 |     plt.savefig("result_bbob_dim_vs_value-fopt_parallel.png")
95 |     plt.show()
96 | 
97 | 
98 | if __name__ == "__main__":
99 |     compare_optimizer()
100 | 
--------------------------------------------------------------------------------
/tests/config/test_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import io
5 | from pathlib import Path
6 | 
7 | from omegaconf import DictConfig
8 | from omegaconf import OmegaConf as oc  # noqa: N813
9 | 
10 | import pytest
11 | 
12 | from aiaccel.config.config import pathlib2str_config, prepare_config, print_config, resolve_inherit
13 | 
14 | 
15 | def test_load_config() -> None:
16 |     config = prepare_config(Path(__file__).parent / "test_conf.yaml")
17 |     assert isinstance(config, DictConfig)
18 |     del config["config_path"]
19 |     del config["working_directory"]
20 |     expected_config = {
21 |         "A": [{"CC": "cc", "AA": "aa", "BB": "bb"}, {"AAA": "aaa"}],
22 |         "B": {"AA": "dummy", "BB": "bb"},
23 |         "C": {"CC": "cc"},
24 |         "D": {"EE": "ee"},
25 |         "E": {"EE": "ee"},
26 |         "Eval": 1.5,
27 |     }
28 | 
29 |     assert config == expected_config
30 | 
31 | 
32 | def test_resolve_inherit() -> None:
33 |     loaded_config = oc.create(
34 |         {
35 |             "A": [{"_inherit_": ["${B}", "${C}"], "AA": "aa"}, {"AAA": "aaa"}],
36 |             "B": {"AA": "dummy", "BB": "bb"},
37 |             "C": {"CC": "cc"},
38 |             "D": {"_inherit_": "${E}"},
39 |             "E": {"EE": "ee"},
40 |         }
41 |     )
42 |     resolved_config = resolve_inherit(loaded_config)
43 |     expected_config = {
44 |         "A": [{"CC": "cc", "AA": "aa", "BB": "bb"}, {"AAA": "aaa"}],
45 |         "B": {"AA": "dummy", "BB": "bb"},
46 |         "C": {"CC": "cc"},
47 |         "D": {"EE": "ee"},
48 |         "E": {"EE": "ee"},
49 |     }
50 | 
51 |     assert resolved_config == expected_config
52 | 
53 | 
54 | def test_resolve_path() -> None:
55 |     config = prepare_config(Path(__file__).parent / "test_resolve_path.yaml")
56 | 
57 |     assert isinstance(config, DictConfig)
58 | 
59 | 
60 | def test_print_config(capfd: pytest.CaptureFixture[str]) -> None:
61 |     conf = oc.create({"foo": {"bar": [1, 2, 3]}})
62 |     print_config(conf)
63 | 
64 |     stdout, _ = capfd.readouterr()
65 |     # To regenerate the golden file below, uncomment:
66 |     # with open(Path(__file__).parent / "test_config_assets" / "print_config.txt", "w") as f:
67 |     #     f.write(stdout)  # noqa:
ERA001 68 | 69 | with open(Path(__file__).parent / "test_config_assets" / "print_config.txt") as f: 70 | stdout_target = f.read() 71 | 72 | assert stdout == stdout_target 73 | 74 | 75 | def test_pathlib2str_config() -> None: 76 | src_conf = oc.create({"foo": {"bar": Path("test/path")}}) 77 | dst_conf = pathlib2str_config(src_conf) 78 | 79 | assert isinstance(dst_conf.foo.bar, str) 80 | assert isinstance(src_conf.foo.bar, Path) 81 | 82 | 83 | def test_load_config_print_option(capfd: pytest.CaptureFixture[str]) -> None: 84 | prepare_config( 85 | Path(__file__).parent / "test_conf.yaml", 86 | print_config=True, 87 | print_config_kwargs={"line_length": 40}, 88 | ) 89 | 90 | stdout, _ = capfd.readouterr() 91 | assert "=" * 40 in stdout 92 | 93 | 94 | def test_print_config_kwargs() -> None: 95 | buffer = io.StringIO() 96 | conf = oc.create({"foo": 1}) 97 | print_config(conf, line_length=10, file=buffer) 98 | 99 | output = buffer.getvalue() 100 | assert "=" * 10 in output 101 | 102 | 103 | def test_load_config_save_option(tmp_path: Path) -> None: 104 | save_dir = tmp_path / "saved" 105 | config = prepare_config( 106 | Path(__file__).parent / "test_conf.yaml", 107 | working_directory=tmp_path, 108 | save_config=True, 109 | save_directory=save_dir, 110 | save_filename="custom.yaml", 111 | ) 112 | 113 | save_path = save_dir / "custom.yaml" 114 | 115 | assert save_path.exists() 116 | assert isinstance(config.working_directory, str) 117 | 118 | reloaded_config = oc.load(save_path) 119 | assert "config_path" in reloaded_config 120 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/datamodules/single_datamodule.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any 5 | 6 | from collections.abc import Callable 7 | 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | import lightning as lt 11 | 12 | from aiaccel.torch.datasets import CachedDataset, scatter_dataset 13 | 14 | 15 | class SingleDataModule(lt.LightningDataModule): 16 | """ 17 | A PyTorch Lightning DataModule designed to handle training and validation datasets 18 | with support for caching and dataset scattering. 19 | 20 | Attributes: 21 | train_dataset_fn (Callable[..., Dataset[str]]): A callable function to create the training dataset. 22 | val_dataset_fn (Callable[..., Dataset[str]]): A callable function to create the validation dataset. 23 | batch_size (int): The batch size for the DataLoader. 24 | use_cache (bool): Whether to cache the datasets. Defaults to False. 25 | use_scatter (bool): Whether to scatter the datasets. Defaults to True. 26 | num_workers (int): Number of workers for the DataLoader. Defaults to 10. 27 | common_args (dict[str, Any] | None): Common arguments to pass to the dataset functions. Defaults to None. 28 | Methods: 29 | setup(stage: str | None) -> None: 30 | Prepares the datasets for training and validation. Only supports the "fit" stage. 31 | Raises a ValueError if the stage is not "fit". 32 | train_dataloader() -> DataLoader: 33 | Returns the DataLoader for the training dataset. 34 | val_dataloader() -> DataLoader: 35 | Returns the DataLoader for the validation dataset. 36 | _create_dataloader(dataset, **kwargs: Any) -> DataLoader: 37 | Internal method to create a DataLoader for a given dataset with specified configurations. 
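
    Example:
        An illustrative sketch (``MyDataset`` stands in for any dataset factory
        accepting the ``common_args`` keywords)::

            datamodule = SingleDataModule(
                train_dataset_fn=MyDataset,
                val_dataset_fn=MyDataset,
                batch_size=32,
                common_args={"root": "./data"},
            )
            datamodule.setup("fit")
            train_loader = datamodule.train_dataloader()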
38 | """ 39 | 40 | def __init__( 41 | self, 42 | train_dataset_fn: Callable[..., Dataset[str]], 43 | val_dataset_fn: Callable[..., Dataset[str]], 44 | batch_size: int, 45 | use_cache: bool = False, 46 | use_scatter: bool = True, 47 | num_workers: int = 10, 48 | common_args: dict[str, Any] | None = None, 49 | ): 50 | super().__init__() 51 | 52 | self.train_dataset_fn = train_dataset_fn 53 | self.val_dataset_fn = val_dataset_fn 54 | 55 | self.common_args = common_args if common_args is not None else {} 56 | 57 | self.batch_size = batch_size 58 | 59 | self.use_cache = use_cache 60 | self.use_scatter = use_scatter 61 | 62 | self.num_workers = num_workers 63 | 64 | def setup(self, stage: str | None) -> None: 65 | if stage == "fit": 66 | train_dataset = self.train_dataset_fn(**self.common_args) 67 | val_dataset = self.val_dataset_fn(**self.common_args) 68 | 69 | print(f"Dataset size: {len(train_dataset)=}, {len(val_dataset)=}") # type: ignore 70 | 71 | if self.use_cache: 72 | train_dataset = CachedDataset(train_dataset) 73 | val_dataset = CachedDataset(val_dataset) 74 | 75 | if self.use_scatter: 76 | train_dataset = scatter_dataset(train_dataset) 77 | val_dataset = scatter_dataset(val_dataset) 78 | 79 | self.train_dataset = train_dataset 80 | self.val_dataset = val_dataset 81 | else: 82 | raise ValueError("`stage` is not 'fit'.") 83 | 84 | def _create_dataloader(self, dataset: Dataset[Any], **kwargs: Any) -> DataLoader[Any]: 85 | return DataLoader( 86 | dataset=dataset, 87 | batch_size=self.batch_size, 88 | num_workers=self.num_workers, 89 | persistent_workers=True, 90 | shuffle=True, 91 | pin_memory=True, 92 | **kwargs, 93 | ) 94 | 95 | def train_dataloader(self) -> DataLoader[Any]: 96 | return self._create_dataloader(self.train_dataset, drop_last=True) 97 | 98 | def val_dataloader(self) -> DataLoader[Any]: 99 | return self._create_dataloader(self.val_dataset, drop_last=False) 100 | -------------------------------------------------------------------------------- /docs/source/user_guide/config.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Managing Configurations 3 | ######################### 4 | 5 | This guide introduces how to manage configuration files using ``aiaccel.config`` and 6 | `Hydra's instantiation mechanism 7 | `_. The key features of 8 | ``aiaccel.config`` are: 9 | 10 | - Modular programming through YAML meta-programming 11 | - Efficient management of multiple config files using ``_base_`` and ``_inherit_`` 12 | attributes 13 | - Easy version control integration with Git 14 | - Minimal dependency on Hydra (only uses ``hydra.utils.instantiate``) 15 | 16 | ***************** 17 | Getting Started 18 | ***************** 19 | 20 | Aiaccel's configuration system is based on `OmegaConf 21 | `_. The typical usage is: 22 | 23 | .. code-block:: yaml 24 | :caption: config.yaml 25 | 26 | model: 27 | _target_: torchvision.models.resnet50 28 | num_classes: 13 29 | 30 | .. code-block:: python 31 | :caption: example.py 32 | 33 | from argparse import ArgumentParser 34 | 35 | from aiaccel.config import ( 36 | prepare_config, 37 | print_config, 38 | ) 39 | from hydra.utils import instantiate 40 | 41 | 42 | parser = ArgumentParser() 43 | parser.add_argument("config", type=str, help="Config file in YAML format") 44 | args, unk_args = parser.parse_known_args() 45 | 46 | config = prepare_config(args.config) 47 | print_config(config) 48 | 49 | model = instantiate(config.model) 50 | 51 | print(model) 52 | 53 | ... 
54 | 
55 | To run the script:
56 | 
57 | .. code-block:: bash
58 | 
59 |     python example.py config.yaml
60 | 
61 | ``prepare_config`` wraps :func:`aiaccel.config.load_config`, processes the ``_base_``
62 | attribute, resolves ``_inherit_`` entries, and returns the ready-to-use configuration
63 | while also allowing you to forward options to :func:`load_config` via
64 | ``load_config_kwargs``.
65 | 
66 | ******************************
67 |  ``_base_`` and ``_inherit_``
68 | ******************************
69 | 
70 | The ``_base_`` attribute allows you to inherit from another configuration file.
71 | 
72 | Example base configuration:
73 | 
74 | .. code-block:: yaml
75 |     :caption: config_base.yaml
76 | 
77 |     params:
78 |       _convert_: partial
79 |       _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
80 |       x1: [0, 1]
81 |       x2:
82 |         _target_: aiaccel.hpo.optuna.hparams.Float
83 |         low: 0.0
84 |         high: 1.0
85 |         log: false
86 | 
87 | Example configuration that uses a base:
88 | 
89 | .. code-block:: yaml
90 |     :caption: config.yaml
91 | 
92 |     _base_: config_base.yaml
93 |     n_trials: 100
94 |     n_max_jobs: 4
95 | 
96 | ``config.yaml`` is automatically expanded to include the contents of
97 | ``config_base.yaml``.
98 | 
99 | The ``_inherit_`` attribute, on the other hand, allows you to duplicate and modify parts
100 | of the configuration. Example configuration:
101 | 
102 | .. code-block:: yaml
103 |     :caption: config.yaml
104 | 
105 |     params:
106 |       _convert_: partial
107 |       _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
108 |       x1:
109 |         _inherit_: "${param}"
110 |       x2:
111 |         _inherit_: "${param}"
112 | 
113 |     objective:
114 |       _target_: objective.main
115 | 
116 |     n_trials: 30
117 |     n_max_jobs: 4
118 | 
119 |     param:
120 |       _target_: aiaccel.hpo.optuna.hparams.Float
121 |       low: 0.0
122 |       high: 1.0
123 |       log: false
124 | 
125 | After processing, the configuration will be expanded so that ``x1`` and ``x2`` each
126 | include the contents of ``param`` along with their own ``name`` fields.
127 | 
128 | *******************
129 |  ``eval`` Resolver
130 | *******************
131 | 
132 | The ``eval`` resolver allows arithmetic expressions within the config. It is
133 | implemented with a safe, restricted form of ``eval``.
134 | 
135 | Example configuration:
136 | 
137 | .. code-block:: yaml
138 |     :caption: config.yaml
139 | 
140 |     n_trials: ${eval:"${n_max_jobs} * 10"}
141 |     n_max_jobs: 4
142 | 
143 | *********************
144 |  Version Controlling
145 | *********************
146 | 
147 | WIP. In the meantime, ``aiaccel.config.git`` provides ``collect_git_status_from_config`` and ``print_git_status`` for recording the Git state of the packages referenced by ``_target_`` entries.
148 | 
149 | ************************
150 |  Additional Information
151 | ************************
152 | 
153 | Detailed information is available at :doc:`API Reference <../api_reference/config>`.
154 | -------------------------------------------------------------------------------- /tests/hpo/apps/test_optimize.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from collections.abc import Callable, Generator 5 | from contextlib import AbstractContextManager, contextmanager 6 | import os 7 | from pathlib import Path 8 | import shutil 9 | import subprocess 10 | 11 | from hydra.utils import instantiate 12 | 13 | import pytest 14 | 15 | from aiaccel.config import prepare_config 16 | 17 | 18 | @pytest.fixture() 19 | def workspace_factory( 20 | tmp_path_factory: pytest.TempPathFactory, 21 | ) -> Callable[[str], AbstractContextManager[Path]]: 22 | @contextmanager 23 | def _factory(data_name: str = "single_objective") -> Generator[Path, None, None]: 24 | tmp_path = tmp_path_factory.mktemp("workspace") 25 | 26 | shutil.copytree(Path(__file__).parent / "data" / data_name, tmp_path, dirs_exist_ok=True) 27 | 28 | org_path = Path.cwd() 29 | 30 | try: 31 | os.chdir(tmp_path) 32 | yield tmp_path 33 | finally: 34 | os.chdir(org_path) 35 | 36 | return _factory 37 | 38 | 39 | def test_from_config(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 40 | with workspace_factory() as workspace: 41 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 42 | 43 | assert (workspace / "optuna.db").exists() 44 | 45 | config = prepare_config(workspace / "merged_config.yaml") 46 | study = instantiate(config.study) 47 | assert len(study.get_trials()) == 15 48 | 49 | 50 | def test_from_cli(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 51 | with workspace_factory() as workspace: 52 | cmd = ( 53 | "aiaccel-hpo optimize" 54 | " working_directory=./cli/" 55 | " n_trials=15" 56 | " n_max_jobs=1" 57 | " params.x1='[0,1]'" 58 | " params.x2='[0,1]'" 59 | " study.sampler._target_=optuna.samplers.TPESampler" 60 | " study.sampler.seed=0" 61 | " --" 62 | " python ./objective.py --x1={x1} --x2={x2} {out_filename}" 63 | ) 64 | subprocess.run(cmd, shell=True, check=True) 65 | 66 | config = prepare_config(workspace / "cli" / "merged_config.yaml") 67 | study = instantiate(config.study) 68 | best_value = study.best_trial.value 69 | 70 | # check consistency with the config-style execution 71 | with workspace_factory() as workspace: 72 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 73 | 74 | config = prepare_config(workspace / "merged_config.yaml") 75 | study = instantiate(config.study) 76 | 77 | assert best_value == study.best_trial.value 78 | 79 | 80 | def test_resume(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 81 | with workspace_factory() as workspace: 82 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 83 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 84 | 85 | config = prepare_config(workspace / "merged_config.yaml") 86 | study = instantiate(config.study) 87 | assert len(study.get_trials()) == 30 88 | 89 | 90 | def test_multi_objective(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 91 | with workspace_factory("multi_objective") as workspace: 92 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 93 | 94 | config = prepare_config(workspace / "merged_config.yaml") 95 | study = 
instantiate(config.study) 96 | 97 | assert len(study.get_trials()) == 15 98 | 99 | assert all(len(trial.values) == 2 for trial in study.get_trials()) 100 | 101 | 102 | def test_from_cli_and_config(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 103 | with workspace_factory() as workspace: 104 | cmd = ( 105 | "aiaccel-hpo optimize" 106 | " n_trials=30" 107 | " n_max_jobs=1" 108 | " params.x1='[0,10]'" 109 | " params.x2='[0,10]'" 110 | " study.sampler._target_=optuna.samplers.TPESampler" 111 | " study.sampler.seed=0" 112 | " --config=config.yaml" 113 | " --" 114 | ) 115 | subprocess.run(cmd, shell=True, check=True) 116 | 117 | assert (workspace / "optuna.db").exists() 118 | 119 | config = prepare_config(workspace / "merged_config.yaml") 120 | study = instantiate(config.study) 121 | assert len(study.get_trials()) == 30 122 | -------------------------------------------------------------------------------- /aiaccel/hpo/apps/optimize.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any 5 | 6 | import argparse 7 | from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait 8 | from datetime import datetime 9 | from importlib import resources 10 | import json 11 | from pathlib import Path 12 | import shlex 13 | import subprocess 14 | import sys 15 | 16 | from hydra.utils import instantiate 17 | from omegaconf import OmegaConf as oc # noqa: N813 18 | 19 | from optuna.trial import Trial 20 | 21 | from aiaccel.config import pathlib2str_config, prepare_config, print_config 22 | 23 | 24 | def main() -> None: 25 | # remove OmegaConf arguments from sys.argv 26 | oc_args = [] 27 | if "--" in sys.argv: # If there are additional arguments before '--', treat them as OmegaConf arguments 28 | sep_idx = sys.argv.index("--") 29 | sys.argv.pop(sep_idx) 30 | 31 | for ii in range(0, sep_idx)[::-1]: 32 | if "=" in sys.argv[ii] and not sys.argv[ii].startswith("-"): 33 | oc_args.append(sys.argv.pop(ii)) 34 | 35 | oc_args = list(reversed(oc_args)) 36 | 37 | # parse arguments 38 | parser = argparse.ArgumentParser( 39 | description="""\ 40 | A helper CLI to optimize hyperparameters using Optuna. 41 | See complete usage: https://aistairc.github.io/aiaccel/user_guide/hpo.html . 
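        Placeholders such as {x1}, {x2}, and {out_filename} in the command are filled
        in for every trial before the command is executed.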
42 | 
43 |         Typical usages:
44 |             aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] -- ./objective.py --x1={x1} --x2={x2} {out_filename}
45 |             aiaccel-hpo optimize --config=config.yaml ./objective.py --x1={x1} --x2={x2} {out_filename}
46 |         """,
47 |         formatter_class=argparse.RawTextHelpFormatter,
48 |     )
49 |     parser.add_argument("--config", type=Path, default=None, help="Path to the configuration file.")
50 |     parser.add_argument("command", nargs=argparse.REMAINDER)
51 | 
52 |     args = parser.parse_args()
53 | 
54 |     # load config
55 |     if args.config is None:
56 |         args.config = resources.files(f"{__package__}.config") / "default.yaml"
57 |         working_directory = Path.cwd().resolve() / f"aiaccel-hpo_{datetime.now():%Y-%m-%d-%H-%M-%S}"
58 |     else:
59 |         working_directory = args.config.parent.resolve()
60 | 
61 |     config = prepare_config(
62 |         config_filename=args.config,
63 |         working_directory=working_directory,
64 |         overwrite_config=oc.from_cli(oc_args),
65 |     )
66 | 
67 |     if len(args.command) > 0:
68 |         config.command = args.command
69 | 
70 |     print_config(config)
71 | 
72 |     # save config
73 |     config.working_directory = Path(config.working_directory)
74 |     config.working_directory.mkdir(parents=True, exist_ok=True)
75 | 
76 |     with open(config.working_directory / "merged_config.yaml", "w") as f:
77 |         oc.save(pathlib2str_config(config), f)
78 | 
79 |     # build study and hparams manager
80 |     study = instantiate(config.study)
81 |     params = instantiate(config.params)
82 | 
83 |     # main loop
84 |     futures: dict[Any, tuple[Trial, Path]] = {}
85 |     submitted_job_count = 0
86 |     finished_job_count = 0
87 | 
88 |     with ThreadPoolExecutor(config.n_max_jobs) as pool:
89 |         while finished_job_count < config.n_trials:
90 |             active_jobs = len(futures)
91 |             available_slots = max(0, config.n_max_jobs - active_jobs)
92 | 
93 |             # Submit jobs to the ThreadPoolExecutor
94 |             for _ in range(min(available_slots, config.n_trials - submitted_job_count)):
95 |                 trial = study.ask()
96 | 
97 |                 out_filename = config.working_directory / f"trial_{trial.number:0>6}.json"
98 | 
99 |                 future = pool.submit(
100 |                     subprocess.run,
101 |                     shlex.join(config.command).format(
102 |                         config=config,
103 |                         job_name=f"trial_{trial.number:0>6}",
104 |                         out_filename=out_filename,
105 |                         **params.suggest_hparams(trial),
106 |                     ),
107 |                     shell=True,
108 |                     check=True,
109 |                 )
110 | 
111 |                 futures[future] = trial, out_filename
112 |                 submitted_job_count += 1
113 | 
114 |             # Read each finished trial's result from out_filename and tell the study
115 |             done_futures, _ = wait(futures.keys(), return_when=FIRST_COMPLETED)
116 |             for future in done_futures:
117 |                 trial, out_filename = futures.pop(future)
118 | 
119 |                 with open(out_filename) as f:
120 |                     y = json.load(f)
121 | 
122 |                 out_filename.unlink()
123 | 
124 |                 frozentrial = study.tell(trial, y)
125 |                 study._log_completed_trial(y if isinstance(y, list) else [y], frozentrial.number, frozentrial.params)
126 |                 finished_job_count += 1
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     main()
131 | 
--------------------------------------------------------------------------------
/aiaccel/config/git.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from dataclasses import dataclass
5 | import importlib.util
6 | import os
7 | from pathlib import Path
8 | import subprocess
9 | 
10 | from omegaconf import DictConfig, ListConfig
11 | 
12 | __all__ = [
13 |     "PackageGitStatus",
14 |     "collect_git_status_from_config",
15 | "print_git_status", 16 | ] 17 | 18 | 19 | @dataclass 20 | class PackageGitStatus: 21 | """ 22 | Represents the Git status of a package. 23 | 24 | Attributes: 25 | package_name (str): The name of the package. 26 | commit_id (str): The current Git commit ID of the repository. 27 | status (list[str]): A list of uncommitted files in the repository. 28 | """ 29 | 30 | package_name: str 31 | commit_id: str 32 | status: list[str] 33 | 34 | def ready(self) -> bool: 35 | """ 36 | Determines if there are no uncommitted changes. 37 | 38 | Returns: 39 | bool: True if there are no uncommitted files, otherwise False. 40 | """ 41 | 42 | return len(self.status) == 0 43 | 44 | 45 | def collect_git_status_from_config(config: DictConfig | ListConfig) -> list[PackageGitStatus]: 46 | """ 47 | Collects the Git status of packages specified in the given configuration. 48 | 49 | Args: 50 | config (DictConfig | ListConfig): The configuration containing package references. 51 | 52 | Returns: 53 | list[PackageGitStatus]: A list of `PackageGitStatus` objects representing 54 | the Git status of the detected packages. 55 | """ 56 | 57 | status_list = [] 58 | 59 | package_names = collect_target_packages(config) 60 | package_names.sort() 61 | 62 | for package_name in package_names: 63 | status = get_git_status(package_name) 64 | 65 | if status is not None: 66 | status_list.append(status) 67 | 68 | return status_list 69 | 70 | 71 | def print_git_status(status: PackageGitStatus | list[PackageGitStatus]) -> None: 72 | """ 73 | Prints the Git status of a package or a list of packages. 74 | 75 | Args: 76 | status (PackageGitStatus | list[PackageGitStatus]): The Git status to print. 77 | """ 78 | 79 | status_list = status if isinstance(status, list) else [status] 80 | 81 | for status in status_list: 82 | print(f"{status.package_name} @ {status.commit_id}") 83 | for st in status.status: 84 | print(f" {st}") 85 | 86 | 87 | def get_git_status(package_name: str) -> PackageGitStatus | None: 88 | """ 89 | Retrieves the Git status of a given package. 90 | 91 | Args: 92 | package_name (str): The name of the package to check. 93 | 94 | Returns: 95 | PackageGitStatus | None: A `PackageGitStatus` object if the package is found 96 | and under Git control, otherwise None. 
97 | """ 98 | 99 | # get package location 100 | spec = importlib.util.find_spec(package_name) 101 | 102 | if spec is None: 103 | return None 104 | 105 | if spec.origin is not None: 106 | module_path = Path(spec.origin).parent.resolve() 107 | elif spec.submodule_search_locations is not None: 108 | module_path = Path(os.path.abspath(spec.submodule_search_locations[0])).resolve() 109 | else: 110 | return None 111 | 112 | # get repository path 113 | result = subprocess.run(["git", "rev-parse", "--show-toplevel"], cwd=module_path, capture_output=True, text=True) 114 | if result.returncode != 0: 115 | return None 116 | 117 | repository_path = Path(result.stdout.splitlines()[0]).resolve() 118 | 119 | # check git_ignore 120 | result = subprocess.run(["git", "check-ignore", module_path], cwd=repository_path, capture_output=True, text=True) 121 | if result.returncode == 0: 122 | return None 123 | 124 | # get commit id 125 | result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=repository_path, capture_output=True, text=True) 126 | commit_id = result.stdout.splitlines()[0] 127 | 128 | # check git status 129 | result = subprocess.run(["git", "status", "-s"], cwd=repository_path, capture_output=True, text=True) 130 | status = result.stdout.splitlines() 131 | 132 | return PackageGitStatus(package_name, commit_id, status) 133 | 134 | 135 | def collect_target_packages(config: ListConfig | DictConfig) -> list[str]: 136 | """ 137 | Extracts the names of target packages from the given configuration. 138 | 139 | Args: 140 | config (ListConfig | DictConfig): The configuration to process. 141 | 142 | Returns: 143 | list[str]: A list of package names extracted from the configuration. 144 | """ 145 | 146 | target_packages = set() 147 | 148 | def inner_func(_config: ListConfig | DictConfig) -> None: 149 | if isinstance(_config, DictConfig): 150 | for key, value in _config.items(): 151 | if key == "_target_": 152 | package_name, *_ = value.split(".") 153 | target_packages.add(package_name) 154 | 155 | inner_func(value) 156 | 157 | elif isinstance(_config, ListConfig): 158 | for item in _config: 159 | inner_func(item) 160 | 161 | inner_func(config) 162 | 163 | return list(target_packages) 164 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/opt_lightning_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | from collections.abc import Callable, Iterator 9 | from dataclasses import dataclass 10 | from fnmatch import fnmatch 11 | 12 | from torch import nn, optim 13 | 14 | import lightning as lt 15 | from lightning.pytorch.utilities.types import OptimizerLRSchedulerConfig 16 | 17 | 18 | @dataclass 19 | class OptimizerConfig: 20 | """ 21 | Configuration for the optimizer and scheduler in a LightningModule. 22 | 23 | Args: 24 | optimizer_generator (Callable[..., optim.Optimizer]): A callable that generates the optimizer. 25 | params_transformer (Callable[..., Iterator[tuple[str, Any]]] | None): A callable that transforms the parameters 26 | into a format suitable for the optimizer. If None, the parameters are used as is. Defaults to None. 27 | scheduler_generator (Callable[..., optim.lr_scheduler.LRScheduler] | None): 28 | A callable that generates the learning rate scheduler. If None, no scheduler is used. 
Defaults to None.
29 |         scheduler_interval (str | None): The interval at which the scheduler is called. Defaults to "step".
30 |         scheduler_monitor (str | None): The metric to monitor for the scheduler. Defaults to "validation/loss".
31 |     """
32 | 
33 |     optimizer_generator: Callable[..., optim.Optimizer]
34 |     params_transformer: Callable[..., Iterator[tuple[str, Any]]] | None = None
35 | 
36 |     scheduler_generator: Callable[..., optim.lr_scheduler.LRScheduler] | None = None
37 |     scheduler_interval: str | None = "step"
38 |     scheduler_monitor: str | None = "validation/loss"
39 | 
40 | 
41 | def build_param_groups(
42 |     named_params: Iterator[tuple[str, nn.Parameter]],
43 |     groups: list[dict[str, Any]],
44 | ) -> list[dict[str, Any]]:
45 |     """
46 |     Build parameter groups for the optimizer based on the provided patterns.
47 | 
48 |     Args:
49 |         named_params (Iterator[tuple[str, nn.Parameter]]): An iterator of named parameters.
50 |         groups (list[dict[str, Any]]): A list of dictionaries where each dictionary contains
51 |             a "pattern" key that specifies the parameter names to match (``fnmatch``), and other optional keys.
52 | 
53 |     Example:
54 |         In your config file, you might have:
55 | 
56 |         .. code-block:: yaml
57 | 
58 |             optimizer_config:
59 |               _target_: aiaccel.torch.lightning.OptimizerConfig
60 |               optimizer_generator:
61 |                 _partial_: True
62 |                 _target_: torch.optim.AdamW
63 |                 weight_decay: 0.01
64 |               params_transformer:
65 |                 _partial_: True
66 |                 _target_: aiaccel.torch.lightning.build_param_groups
67 |                 groups:
68 |                   - pattern: "*bias"
69 |                     lr: 0.01
70 |                   - pattern: "*weight"
71 |                     lr: 0.001
72 | 
73 |         This will create two parameter groups: one for biases with a learning rate of 0.01 and another for weights with
74 |         a learning rate of 0.001.
75 |     """
76 |     remaining = dict(named_params)
77 | 
78 |     param_groups = []
79 |     for spec in groups:
80 |         matched_params = []
81 |         for target in [spec["pattern"]] if isinstance(spec["pattern"], str) else spec["pattern"]:
82 |             matched_params += [remaining.pop(name) for name in list(remaining.keys()) if fnmatch(name, target)]
83 | 
84 |         assert len(matched_params) > 0, f"No parameters matched pattern(s): {spec['pattern']}"
85 | 
86 |         param_groups.append({"params": matched_params} | {k: v for k, v in spec.items() if k != "pattern"})
87 | 
88 |     param_groups.append({"params": list(remaining.values())})
89 | 
90 |     return param_groups
91 | 
92 | 
93 | class OptimizerLightningModule(lt.LightningModule):
94 |     """
95 |     LightningModule subclass for models that use custom optimizers and schedulers.
96 | 
97 |     Args:
98 |         optimizer_config (OptimizerConfig): Configuration object for the optimizer.
99 | 
100 |     Attributes:
101 |         _optimizer_config (OptimizerConfig): Configuration object for the optimizer.
102 | 
103 |     Methods:
104 |         configure_optimizers: Configures the optimizer and scheduler for training.
105 |     """
106 | 
107 |     def __init__(self, optimizer_config: OptimizerConfig):
108 |         super().__init__()
109 | 
110 |         self._optimizer_config = optimizer_config
111 | 
112 |     def configure_optimizers(self) -> optim.Optimizer | OptimizerLRSchedulerConfig:
113 |         """
114 |         Configures the optimizer and scheduler for training.
115 | 
116 |         Returns:
117 |             Union[optim.Optimizer, OptimizerLRSchedulerConfig]: The optimizer and scheduler configuration.
118 | """ 119 | 120 | params: Iterator[tuple[str, Any]] | Iterator[nn.Parameter] 121 | if self._optimizer_config.params_transformer is None: 122 | params = self.parameters() # just because backward compatibility 123 | else: 124 | params = self._optimizer_config.params_transformer(self.named_parameters()) 125 | 126 | optimizer = self._optimizer_config.optimizer_generator(params=params) # 127 | 128 | if self._optimizer_config.scheduler_generator is None: 129 | return optimizer 130 | else: 131 | assert self._optimizer_config.scheduler_interval is not None 132 | assert self._optimizer_config.scheduler_monitor is not None 133 | return { 134 | "optimizer": optimizer, 135 | "lr_scheduler": { 136 | "scheduler": self._optimizer_config.scheduler_generator(optimizer=optimizer), 137 | "interval": self._optimizer_config.scheduler_interval, 138 | "monitor": self._optimizer_config.scheduler_monitor, 139 | }, 140 | } 141 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/callbacks/load_pretrained.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from fnmatch import fnmatch 7 | import logging 8 | from pathlib import Path 9 | import re 10 | 11 | import torch 12 | 13 | import lightning as lt 14 | 15 | from aiaccel.torch.lightning import load_checkpoint 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class LoadPretrainedCallback(lt.Callback): 21 | """Initialize a model from a pretrained checkpoint before training or validation. 22 | 23 | The callback loads weights from ``model_path`` once fitting or validation begins, 24 | matches finetune parameters to pretrained ones using glob-like patterns, and copies 25 | the matching weights into the finetune module before any optimization steps run. 26 | 27 | Args: 28 | model_path: Directory containing checkpoints saved by :func:`load_checkpoint`. 29 | target_patterns: Glob expressions that describe finetune parameters which 30 | should be initialized from pretrained weights. 31 | pattern_map: Optional mapping from finetune patterns to pretrained ones. 32 | Wildcards (``"*"``) are allowed and must appear the same number of times 33 | on both sides of the mapping. 34 | source_excludes: Optional pretrained-side glob patterns that should never be 35 | copied even when referenced by ``pattern_map``. 36 | target_excludes: Optional finetune-side glob patterns that should never be 37 | overwritten. 38 | config_name: Name of the checkpoint configuration to load. 
39 | 40 | Example:: 41 | 42 | callback = LoadPretrainedCallback( 43 | model_path=Path("pretrain_ckpt"), 44 | target_patterns=["detr_module.*"], 45 | pattern_map={"backbone.*": "visual_backbone.*"}, 46 | source_excludes=["detr_module.heads.cls_head.*"], 47 | config_name="merged_config.yaml", 48 | ) 49 | trainer = lt.Trainer(callbacks=[callback]) 50 | trainer.fit(model) 51 | """ 52 | 53 | def __init__( 54 | self, 55 | model_path: Path, 56 | target_patterns: list[str], 57 | pattern_map: dict[str, str] | None = None, 58 | source_excludes: list[str] | None = None, 59 | target_excludes: list[str] | None = None, 60 | config_name: str = "merged_config.yaml", 61 | ) -> None: 62 | super().__init__() 63 | 64 | # remember configuration about where to load and which config to use 65 | self.model_path = Path(model_path) 66 | self.config_name = config_name 67 | 68 | pattern_map = pattern_map or {} 69 | assert set(pattern_map) <= set(target_patterns) 70 | 71 | # build pattern dictionary used to match finetune parameters to pretrained ones 72 | pattern_dict = {ptn: ptn for ptn in target_patterns} 73 | pattern_dict.update(pattern_map) 74 | 75 | # remember exclusion filters for finetune and pretrained parameters 76 | self.source_excludes = source_excludes or [] 77 | self.target_excludes = target_excludes or [] 78 | 79 | # cache the derived mappings and bookkeeping flags 80 | self._ptn_dict = pattern_dict 81 | self._loaded = False 82 | 83 | @torch.no_grad() 84 | def on_fit_start(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None: # type: ignore[override] 85 | """Load pretrained weights and copy them into matching finetune parameters.""" 86 | if self._loaded: 87 | return 88 | 89 | # load pretrained checkpoint and copy it to CPU tensors 90 | src_model, *_ = load_checkpoint(self.model_path, self.config_name, device="cpu") 91 | src_state_dict = {name: weight.cpu() for name, weight in src_model.state_dict().items()} 92 | dst_state_dict = dict(pl_module.state_dict()) 93 | 94 | # iterate over each user-defined pattern rule 95 | for dst_ptn, src_ptn in self._ptn_dict.items(): 96 | assert dst_ptn.count("*") == src_ptn.count("*") 97 | rgx_ptn = re.compile("^" + re.escape(dst_ptn).replace(r"\*", "(.*)") + "$") 98 | update_state: dict[str, torch.Tensor] = {} 99 | 100 | # look for finetune parameters matching the current rule 101 | for dst_name, dst_weight in dst_state_dict.items(): 102 | match_ptn = rgx_ptn.fullmatch(dst_name) 103 | if not match_ptn: 104 | continue 105 | if any(fnmatch(dst_name, ptn) for ptn in self.target_excludes): 106 | continue 107 | 108 | groups = iter(match_ptn.groups()) 109 | src_name = "".join(next(groups) if ch == "*" else ch for ch in src_ptn) 110 | 111 | # ensure we only pull parameters that are not excluded 112 | if any(fnmatch(src_name, ptn) for ptn in self.source_excludes): 113 | continue 114 | 115 | # fetch pretrained tensor and check compatibility before scheduling update 116 | src_weight = src_state_dict.get(src_name) 117 | assert src_weight is not None, ( 118 | f"Pretrained key not found: pretrained['{src_name}'] (for finetune['{dst_name}'])." 119 | ) 120 | assert src_weight.shape == dst_weight.shape 121 | 122 | update_state[dst_name] = src_weight 123 | 124 | logger.debug(f"Parameter '{dst_name}' initialized from '{src_name}' in checkpoint.") 125 | 126 | # apply the collected updates for this rule and mark them as assigned 127 | assert update_state, f"No parameters matched rule: '{dst_ptn}' -> '{src_ptn}'." 
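            # strict=False is deliberate: update_state covers only the parameters
            # matched by this rule, not the module's full state dict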
128 |             pl_module.load_state_dict(update_state, strict=False)
129 | 
130 |             for dst_name in update_state:
131 |                 dst_state_dict.pop(dst_name)
132 | 
133 |         # prevent re-loading so weights are only imported once
134 |         self._loaded = True
135 | 
136 |     def on_validation_start(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None:  # type: ignore[override]
137 |         """Ensure pretrained weights are loaded before running validation."""
138 |         self.on_fit_start(trainer, pl_module)
139 | 
--------------------------------------------------------------------------------
/aiaccel/torch/h5py/hdf5_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import numpy.typing as npt
5 | from typing import Any, Generic, TypeVar
6 | 
7 | from abc import ABCMeta, abstractmethod
8 | from functools import reduce
9 | import json
10 | from math import ceil
11 | from pathlib import Path
12 | 
13 | from rich.progress import track
14 | 
15 | import h5py
16 | 
17 | T1 = TypeVar("T1")
18 | T2 = TypeVar("T2")
19 | 
20 | 
21 | class HDF5Writer(Generic[T1, T2], metaclass=ABCMeta):
22 |     """
23 |     Abstract base class for writing data to an HDF5 file.
24 | 
25 |     This class provides methods to write data into HDF5 format, supporting both
26 |     single-process and parallel (MPI-based) writing. Subclasses must implement
27 |     `prepare_globals` and `prepare_group` to define how data is structured.
28 | 
29 |     Typical usage is supposed to be (assuming ``numpy`` is imported as ``np``):
30 | 
31 |     .. code-block:: python
32 | 
33 |         class FooHDF5Writer(HDF5Writer):
34 |             def prepare_globals(self):
35 |                 item_list = list(range(100))
36 | 
37 |                 offset = 10
38 |                 maximum = 50
39 | 
40 |                 return item_list, (offset, maximum)
41 | 
42 |             def prepare_group(self, item, context):
43 |                 offset, maximum = context
44 | 
45 |                 group_name = f"{item:04d}"
46 | 
47 |                 return {group_name: {"data": np.full([10, 10], offset + item).clip(max=maximum)}}
48 | 
49 |         writer = FooHDF5Writer()
50 |         writer.write(Path("test.hdf5"), parallel=False)
51 |     """
52 | 
53 |     h5: h5py.File
54 | 
55 |     def _write(self, filename: Path) -> None:
56 |         """
57 |         Write data to an HDF5 file using a single process.
58 | 
59 |         Args:
60 |             filename (Path): Path to the output HDF5 file.
61 |         """
62 | 
63 |         # prepare globals
64 |         items, context = self.prepare_globals()
65 |         group_list = []
66 | 
67 |         # write into hdf5 file
68 |         with h5py.File(filename, "w") as h5:
69 |             for item in track(items):
70 |                 groups = self.prepare_group(item, context)
71 | 
72 |                 for group_name, datasets in groups.items():
73 |                     g = h5.create_group(group_name)
74 | 
75 |                     for dataset_name, data in datasets.items():
76 |                         ds = g.create_dataset(dataset_name, data.shape, dtype=data.dtype)
77 |                         ds[:] = data
78 | 
79 |                     group_list.append(group_name)
80 | 
81 |         with open(filename.with_suffix(".json"), "w") as f:
82 |             json.dump(group_list, f)
83 | 
84 |     def _write_parallel(self, filename: Path) -> None:
85 |         """
86 |         Write data to an HDF5 file using MPI for parallel processing.
87 | 
88 |         Args:
89 |             filename (Path): Path to the output HDF5 file.
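
        Note:
            Requires ``mpi4py`` and an MPI-enabled build of ``h5py``; the file is
            opened with ``driver="mpio"``.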
90 | """ 91 | 92 | # prepare MPI 93 | from mpi4py.MPI import COMM_WORLD 94 | 95 | comm = COMM_WORLD 96 | 97 | rank = comm.Get_rank() 98 | size = comm.Get_size() 99 | 100 | # prepare globals 101 | if rank == 0: 102 | items, context = self.prepare_globals() 103 | items = list(items) + (ceil(len(items) / size) * size - len(items)) * [None] 104 | 105 | globals_ = items, context 106 | else: 107 | globals_ = None 108 | 109 | items, context = comm.bcast(globals_, root=0) 110 | group_list = [] 111 | 112 | # write into hdf5 file 113 | with h5py.File(filename, "w", driver="mpio", comm=comm) as h5: 114 | track_ = track if rank == 0 else lambda x, **kwargs: x 115 | for item in track_(items[rank::size]): 116 | groups = self.prepare_group(item, context) if item is not None else {} 117 | 118 | groups_info = {} 119 | for group_name, datasets in groups.items(): 120 | groups_info[group_name] = {dset: (data.shape, data.dtype) for dset, data in datasets.items()} 121 | 122 | for group_name, datasets in reduce(dict.__or__, comm.allgather(groups_info)).items(): 123 | g = h5.create_group(group_name) 124 | 125 | for dataset_name, (shape, dtype) in datasets.items(): 126 | g.create_dataset(dataset_name, shape, dtype=dtype) 127 | 128 | group_list.append(group_name) 129 | 130 | for group_name, datasets in groups.items(): 131 | g = h5[group_name] # type: ignore 132 | 133 | for dataset_name, data in datasets.items(): 134 | g[dataset_name][:] = data # type: ignore 135 | 136 | if rank == 0: 137 | with open(filename.with_suffix(".json"), "w") as f: 138 | json.dump(group_list, f) 139 | 140 | def write(self, filename: Path, parallel: bool = False) -> None: 141 | """ 142 | Write data to an HDF5 file, optionally using parallel processing. 143 | 144 | Args: 145 | filename (Path): Path to the output HDF5 file. 146 | parallel (bool, optional): Whether to use parallel writing. Defaults to False. 147 | """ 148 | 149 | if not parallel: 150 | self._write(filename) 151 | else: 152 | self._write_parallel(filename) 153 | 154 | @abstractmethod 155 | def prepare_globals(self) -> tuple[list[T1], T2]: 156 | """ 157 | Prepare the global data required for writing. 158 | 159 | This method must be implemented by subclasses to provide the data items 160 | and any necessary context for processing. 161 | 162 | Returns: 163 | tuple[list[T1], T2]: A tuple containing a list of data items and 164 | context information. 165 | """ 166 | pass 167 | 168 | @abstractmethod 169 | def prepare_group(self, item: T1, context: T2) -> dict[str, dict[str, npt.NDArray[Any]]]: 170 | """ 171 | Prepare groups of datasets for writing to HDF5. 172 | 173 | This method must be implemented by subclasses to define how individual 174 | data items should be structured within the HDF5 file. 175 | 176 | Args: 177 | item (T1): A single data item. 178 | context (T2): Additional context for processing. 179 | 180 | Returns: 181 | dict[str, dict[str, npt.NDArray[Any]]]: A dictionary mapping group names 182 | to dataset dictionaries. 
183 | """ 184 | pass 185 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/experiment_coco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 3 | # SPDX-License-Identifier: MIT 4 | 5 | from typing import Any 6 | 7 | import argparse 8 | from collections.abc import Callable 9 | from concurrent.futures import ThreadPoolExecutor 10 | import csv 11 | import os 12 | import time 13 | 14 | import pandas as pd 15 | 16 | import cocoex 17 | import optuna 18 | 19 | from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadEmptyError, NelderMeadSampler 20 | 21 | 22 | def _optimize_sequential( 23 | study: optuna.Study, func: Callable[[list[float]], float], search_space: dict[str, tuple[int | float, int | float]] 24 | ) -> float | None: 25 | try: 26 | trial = study.ask() 27 | except NelderMeadEmptyError: 28 | return None 29 | param = [] 30 | for name, distribution in search_space.items(): 31 | param.append(trial.suggest_float(name, *distribution)) 32 | 33 | result = func(param) 34 | time.sleep(0.1) 35 | 36 | frozentrial = study.tell(trial, result) 37 | study._log_completed_trial([result], frozentrial.number, frozentrial.params) 38 | return result 39 | 40 | 41 | def _optimize_sequential_wrapper(args: list[Any]) -> float | None: 42 | return _optimize_sequential(*args) 43 | 44 | 45 | def optimize( 46 | study: optuna.Study, 47 | func: Callable[[list[float]], float], 48 | search_space: dict[str, tuple[int | float, int | float]], 49 | result_csv_name: str, 50 | num_trial: int = 1000, 51 | num_parallel: int = 10, 52 | ) -> None: 53 | csv_array: list[list[str | float]] = [["step", "value"]] 54 | 55 | with ThreadPoolExecutor(max_workers=num_parallel) as executor: 56 | for step in range(int(num_trial / num_parallel)): 57 | results = executor.map( 58 | _optimize_sequential_wrapper, [(study, func, search_space) for _ in range(num_parallel)] 59 | ) 60 | for result in results: 61 | if result is not None: 62 | csv_array.append([step, result]) 63 | 64 | with open(result_csv_name, "w") as f: 65 | writer = csv.writer(f) 66 | writer.writerows(csv_array) 67 | 68 | 69 | def create_optuna_result( 70 | study: optuna.Study, output_folder: str, problem: Any, optuna_seed: int, sampler_name: str 71 | ) -> None: 72 | study_df = study.trials_dataframe() 73 | result_dir = f"{sampler_name}/optuna_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}" 74 | os.makedirs(result_dir, exist_ok=True) 75 | study_df.to_csv(result_dir + f"/result_{problem.id}_{optuna_seed:03}.csv") 76 | 77 | 78 | def experiment_bbob() -> None: 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--func_id") 81 | parser.add_argument("--dim") 82 | parser.add_argument("--instance") 83 | parser.add_argument("--optuna_seed") 84 | parser.add_argument("--sampler_name") 85 | args, _ = parser.parse_known_args() 86 | 87 | func_id = int(args.func_id) 88 | dim = int(args.dim) 89 | instance = int(args.instance) 90 | optuna_seed = int(args.optuna_seed) 91 | sampler_name = args.sampler_name 92 | 93 | ### input 94 | suite_name = "bbob" 95 | output_folder = f"optuna-{sampler_name}-func_id{func_id}-dim{dim}-instance{instance}" 96 | budget_multiplier = 200 # increase to 10, 100, ... 
97 | 
98 |     ### prepare
99 |     suite_options = f"function_indices: {func_id} dimensions: {dim} instance_indices: {instance}"
100 |     print(suite_options)
101 |     suite = cocoex.Suite(suite_name, "", suite_options)
102 |     observer = cocoex.Observer(suite_name, "result_folder: " + output_folder)
103 |     minimal_print = cocoex.utilities.MiniPrint()
104 | 
105 |     num_parallel = 10
106 | 
107 |     ### go
108 |     for problem in suite:  # this loop will take several minutes or longer
109 |         problem.observe_with(observer)  # generates the data for cocopp post-processing
110 | 
111 |         search_space: dict[str, tuple[int | float, int | float]] = {}
112 |         for i in range(problem.dimension):
113 |             search_space[f"x{i}"] = (-5.0, 5.0)
114 |         print(search_space)
115 | 
116 |         if sampler_name == "nelder-mead":
117 |             # Nelder-Mead (no sub-sampler)
118 |             study = optuna.create_study(
119 |                 sampler=NelderMeadSampler(search_space=search_space, seed=optuna_seed, block=False)
120 |             )
121 |         elif sampler_name == "nelder-mead-subTPE":
122 |             # NM+subTPE
123 |             sub_sampler = optuna.samplers.TPESampler(seed=optuna_seed, consider_magic_clip=True, multivariate=False)
124 |             study = optuna.create_study(
125 |                 sampler=NelderMeadSampler(
126 |                     search_space=search_space, seed=optuna_seed, block=False, sub_sampler=sub_sampler
127 |                 )
128 |             )
129 |         elif sampler_name == "TPE":
130 |             # TPE
131 |             study = optuna.create_study(
132 |                 sampler=optuna.samplers.TPESampler(seed=optuna_seed, consider_magic_clip=True, multivariate=False)
133 |             )
134 |         else:
135 |             raise ValueError(f"{sampler_name} is not defined.")
136 | 
137 |         num_trial = budget_multiplier * problem.dimension
138 |         step_csv_dir = f"{sampler_name}/step_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
139 |         os.makedirs(step_csv_dir, exist_ok=True)
140 |         optimize(
141 |             study,
142 |             problem,
143 |             search_space,
144 |             step_csv_dir + f"result_{problem.id}_{optuna_seed:03}.csv",
145 |             num_trial,
146 |             num_parallel,
147 |         )
148 | 
149 |         create_optuna_result(study, output_folder, problem, optuna_seed, sampler_name)
150 | 
151 |         optuna_seed += 1
152 | 
153 |         minimal_print(problem, final=problem.index == len(suite) - 1)
154 | 
155 |     # result - f_opt
156 |     for i, problem in enumerate(suite):
157 |         coco_file_path = (
158 |             "exdata/"
159 |             + f"{output_folder}/"
160 |             + f"data_f{problem.id_function}/bbobexp_f{problem.id_function}_DIM{problem.dimension}.rdat"
161 |         )
162 | 
163 |         with open(coco_file_path) as f:
164 |             data = f.readlines()
165 | 
166 |         f_opt = float(data[i % 15].split(" ")[12][1:-1])  # optimum recorded in the COCO .rdat file
167 |         print(f_opt)
168 | 
169 |         optuna_result_dir = (
170 |             f"{sampler_name}/optuna_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
171 |         )
172 |         optuna_seed = i + 1
173 |         df = pd.read_csv(optuna_result_dir + f"result_{problem.id}_{optuna_seed:03}.csv")
174 |         df["value - f_opt"] = df["value"] - f_opt
175 | 
176 |         print(df)
177 |         df.to_csv(optuna_result_dir + f"result_{problem.id}_{optuna_seed:03}_fopt.csv")
178 | 
179 |         step_csv_dir = f"{sampler_name}/step_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
180 |         df = pd.read_csv(step_csv_dir + f"result_{problem.id}_{optuna_seed:03}.csv")
181 |         df["value - f_opt"] = df["value"] - f_opt
182 | 
183 |         print(df)
184 |         df.to_csv(step_csv_dir + f"result_{problem.id}_{optuna_seed:03}_fopt.csv")
185 | 
186 | 
187 | if __name__ == "__main__":
188 |     experiment_bbob()
189 | 
--------------------------------------------------------------------------------