├── docs
├── image
│ ├── favicon.ico
│ └── logo_aiaccel.png
├── source
│ ├── _static
│ │ └── logo_aiaccel.png
│ ├── api_reference
│ │ ├── index.rst
│ │ ├── config.rst
│ │ ├── hpo.rst
│ │ └── torch.rst
│ ├── contribution_guide
│ │ ├── index.rst
│ │ ├── issues.md
│ │ ├── documentation.md
│ │ ├── tests.md
│ │ ├── coding_styles.md
│ │ └── pull_requests.md
│ ├── user_guide
│ │ ├── index.md
│ │ ├── torch.rst
│ │ └── config.rst
│ ├── index.rst
│ └── conf.py
├── Makefile
└── make.bat
├── MANIFEST.in
├── tests
├── config
│ ├── test_base.yaml
│ ├── test_resolve_path.yaml
│ ├── test_conf.yaml
│ ├── test_config_assets
│ │ └── print_config.txt
│ ├── apps
│ │ ├── test_check_git.yaml
│ │ └── test_check_git.py
│ └── test_config.py
├── torch
│ ├── datasets
│ │ ├── test_hdf5_dataset_assets
│ │ │ └── dataset.hdf5
│ │ ├── test_cached_dataset.py
│ │ ├── test_scatter_dataset.py
│ │ └── test_hdf5_dataset.py
│ └── lightning
│ │ └── test_abci_environment.py
├── job
│ └── apps
│ │ ├── config
│ │ └── custom_local.yaml
│ │ └── local.py
└── hpo
│ ├── apps
│ ├── data
│ │ ├── single_objective
│ │ │ ├── config.yaml
│ │ │ └── objective.py
│ │ └── multi_objective
│ │ │ ├── config.yaml
│ │ │ └── objective.py
│ └── test_optimize.py
│ └── optuna
│ ├── samplers
│ ├── results_ackley_step.csv
│ ├── results_ackley_int.csv
│ ├── results_ackley.csv
│ ├── results_ackley_logscale.csv
│ └── results_shpere_parallel.csv
│ └── test_hparams.py
├── examples
├── config
│ └── basic
│ │ ├── config.yaml
│ │ └── example.py
├── hpo
│ ├── benchmark
│ │ ├── result_bbob_dim_vs_value-fopt_parallel.png
│ │ ├── objective.sh
│ │ ├── main_parallel_coco.py
│ │ ├── README_ja.md
│ │ ├── job_config.yaml
│ │ ├── README.md
│ │ ├── plot.py
│ │ └── experiment_coco.py
│ ├── basic
│ │ ├── experiment
│ │ │ └── config.yaml
│ │ ├── objective.py
│ │ └── README.md
│ └── nelder_mead
│ │ ├── example.py
│ │ ├── README.md
│ │ ├── example_parallel.py
│ │ ├── example_sub_sampler.py
│ │ ├── example_enqueue.py
│ │ └── README_ja.md
└── torch
│ └── image_classification
│ ├── recipes
│ ├── resnet50.cifar10.ddp
│ │ └── config.yaml
│ └── resnet50.cifar10
│ │ └── config.yaml
│ ├── pyproject.toml
│ ├── src
│ └── image_classification
│ │ ├── small_resnet50.py
│ │ └── task.py
│ └── README.md
├── LICENSE_HEADER
├── .gitattributes
├── aiaccel
├── hpo
│ ├── __init__.py
│ ├── apps
│ │ ├── __init__.py
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ └── default.yaml
│ │ └── optimize.py
│ ├── optuna
│ │ ├── __init__.py
│ │ ├── samplers
│ │ │ └── __init__.py
│ │ ├── hparams.py
│ │ └── hparams_manager.py
│ └── algorithms
│ │ └── __init__.py
├── torch
│ ├── __init__.py
│ ├── apps
│ │ ├── __init__.py
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ ├── train_base.yaml
│ │ │ └── train_ddp.yaml
│ │ └── train.py
│ ├── h5py
│ │ ├── __init__.py
│ │ └── hdf5_writer.py
│ ├── lr_schedulers
│ │ ├── __init__.py
│ │ └── sequential_lr.py
│ ├── lightning
│ │ ├── datamodules
│ │ │ ├── __init__.py
│ │ │ └── single_datamodule.py
│ │ ├── callbacks
│ │ │ ├── __init__.py
│ │ │ ├── save_metric.py
│ │ │ ├── print_unused_param.py
│ │ │ └── load_pretrained.py
│ │ ├── __init__.py
│ │ ├── abci_environment.py
│ │ ├── ckpt.py
│ │ └── opt_lightning_module.py
│ ├── pipelines
│ │ └── __init__.py
│ ├── functional
│ │ ├── __init__.py
│ │ └── linear_sum_assignment.py
│ └── datasets
│ │ ├── __init__.py
│ │ ├── scatter_dataset.py
│ │ ├── file_cached_dataset.py
│ │ ├── hdf5_dataset.py
│ │ └── cached_dataset.py
├── config
│ ├── apps
│ │ ├── __init__.py
│ │ ├── get_value.py
│ │ └── check_git.py
│ ├── __init__.py
│ └── git.py
├── job
│ └── apps
│ │ ├── config
│ │ ├── __init__.py
│ │ ├── local.yaml
│ │ ├── slurm.yaml
│ │ ├── sge.yaml
│ │ └── pbs.yaml
│ │ ├── local.py
│ │ ├── __init__.py
│ │ ├── slurm.py
│ │ ├── sge.py
│ │ └── pbs.py
├── __init__.py
└── launcher.py
├── .readthedocs.yaml
├── mypy.ini
├── .github
├── ISSUE_TEMPLATE
│ ├── feature_request.md
│ └── bug_report.md
└── workflows
│ ├── lint.yaml
│ ├── ci.yaml
│ └── pypi-publish.yaml
├── LICENSE
├── typings
└── h5py.pyi
├── .pre-commit-config.yaml
├── README.md
├── .gitignore
└── pyproject.toml
/docs/image/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/image/favicon.ico
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include aiaccel/hpo/apps/config/*.yaml
include aiaccel/torch/apps/config/*.yaml
--------------------------------------------------------------------------------
/tests/config/test_base.yaml:
--------------------------------------------------------------------------------
A:
  - AAA: base
D:
  _inherit_: ${E}
E:
  EE: ee
--------------------------------------------------------------------------------
/docs/image/logo_aiaccel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/image/logo_aiaccel.png
--------------------------------------------------------------------------------
/examples/config/basic/config.yaml:
--------------------------------------------------------------------------------
model:
  _target_: torchvision.models.resnet50
  num_classes: 13
--------------------------------------------------------------------------------
/tests/config/test_resolve_path.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

--------------------------------------------------------------------------------
/docs/source/_static/logo_aiaccel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/docs/source/_static/logo_aiaccel.png
--------------------------------------------------------------------------------
/LICENSE_HEADER:
--------------------------------------------------------------------------------
Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
SPDX-License-Identifier: MIT
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true
--------------------------------------------------------------------------------
/aiaccel/hpo/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/config/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/hpo/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/hpo/optuna/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/apps/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/tests/torch/datasets/test_hdf5_dataset_assets/dataset.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/tests/torch/datasets/test_hdf5_dataset_assets/dataset.hdf5
--------------------------------------------------------------------------------
/aiaccel/hpo/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/job/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

--------------------------------------------------------------------------------
/examples/hpo/benchmark/result_bbob_dim_vs_value-fopt_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aistairc/aiaccel/HEAD/examples/hpo/benchmark/result_bbob_dim_vs_value-fopt_parallel.png
--------------------------------------------------------------------------------
/docs/source/api_reference/index.rst:
--------------------------------------------------------------------------------
###############
 API Reference
###############

.. toctree::
    :maxdepth: 2

    config
    torch
    hpo
--------------------------------------------------------------------------------
/aiaccel/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from importlib.metadata import version

__version__ = version(__package__)
--------------------------------------------------------------------------------
/tests/config/test_conf.yaml:
--------------------------------------------------------------------------------
_base_: test_base.yaml

A:
  - _inherit_: ["${B}", "${C}"]
    AA: aa
  - AAA: aaa

B:
  AA: dummy
  BB: bb

C:
  CC: cc

Eval: ${eval:"(21 + 9) / (4 + (8 % 3) ** 4)"}
--------------------------------------------------------------------------------
/tests/job/apps/config/custom_local.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.job.apps.config}/local.yaml

script_prologue: |
  echo Hostname: $(hostname)

  export CUDA_VISIBLE_DEVICES=all

  echo ${config_path} | tee config_path.txt
--------------------------------------------------------------------------------
/aiaccel/torch/h5py/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.h5py.hdf5_writer import HDF5Writer

__all__ = [
    "HDF5Writer",
]
--------------------------------------------------------------------------------
/aiaccel/torch/lr_schedulers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lr_schedulers.sequential_lr import SequentialLR

__all__ = ["SequentialLR"]
--------------------------------------------------------------------------------
/examples/hpo/benchmark/objective.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#$-l rt_C.small=1
#$-cwd

source /etc/profile.d/modules.sh
module load gcc/13.2.0
module load python/3.10/3.10.14
source xxx/aiaccel_env/bin/activate

python3.10 experiment_coco.py $@
--------------------------------------------------------------------------------
/tests/config/test_config_assets/print_config.txt:
--------------------------------------------------------------------------------
================================================================================
foo:
  bar:
  - 1
  - 2
  - 3
================================================================================
--------------------------------------------------------------------------------
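The two fixtures above (test_base.yaml and test_conf.yaml) exercise aiaccel's config composition: `_base_` pulls in a parent file, `_inherit_` merges referenced nodes, and the `${eval:...}` resolver evaluates arithmetic (the expression above works out to 30 / 20 = 1.5). A minimal sketch of driving this machinery from Python follows; the function names come from aiaccel/config/__init__.py later in this dump, but the exact call pattern is an assumption:

```python
# Hedged sketch: load a config with _base_/_inherit_ composition and print it.
# load_config / resolve_inherit / print_config are exported by aiaccel.config;
# whether load_config resolves _base_ on its own is an assumption here.
from aiaccel.config import load_config, print_config, resolve_inherit

config = load_config("tests/config/test_conf.yaml")  # merges _base_: test_base.yaml
config = resolve_inherit(config)  # expands the _inherit_ references in A and D
print_config(config)  # prints a banner-framed dump like print_config.txt above
```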
/.readthedocs.yaml:
--------------------------------------------------------------------------------
version: "2"

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.10"

python:
  install:
    - method: pip
      path: .
      extra_requirements:
        - dev

sphinx:
  configuration: docs/source/conf.py
--------------------------------------------------------------------------------
/aiaccel/hpo/optuna/samplers/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

__all__ = ["NelderMeadSampler"]
--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/train_base.yaml:
--------------------------------------------------------------------------------
trainer:
  _target_: lightning.Trainer
  default_root_dir: ${working_directory}

  logger:
    _target_: lightning.pytorch.loggers.TensorBoardLogger
    save_dir: ${working_directory}
    name: ''
    version: ''
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/datamodules/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.datamodules.single_datamodule import SingleDataModule

__all__ = ["SingleDataModule"]
--------------------------------------------------------------------------------
/aiaccel/torch/pipelines/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.pipelines.base_pipeline import BasePipeline, reorder_fields

__all__ = ["BasePipeline", "reorder_fields"]
--------------------------------------------------------------------------------
/aiaccel/torch/functional/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.functional.linear_sum_assignment import linear_sum_assignment

__all__ = [
    "linear_sum_assignment",
]
--------------------------------------------------------------------------------
/examples/torch/image_classification/recipes/resnet50.cifar10.ddp/config.yaml:
--------------------------------------------------------------------------------
_base_:
  - ../resnet50.cifar10/config.yaml

trainer:
  devices: "auto"
  sync_batchnorm: true

datamodule:
  batch_size: 256

task:
  optimizer_config:
    optimizer_generator:
      lr: 8.e-3
--------------------------------------------------------------------------------
/docs/source/contribution_guide/index.rst:
--------------------------------------------------------------------------------
####################
 Contribution Guide
####################

Thank you for contributing to aiaccel! This document introduces how to contribute.

.. toctree::
    :maxdepth: 2

    issues
    pull_requests
    documentation
    tests
    coding_styles
--------------------------------------------------------------------------------
/examples/torch/image_classification/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "image_classification"
version = "0.0.0"
requires-python = ">=3.10"
dependencies = [
    "torchvision",
    "torchmetrics",
    "aiaccel"
]
--------------------------------------------------------------------------------
/aiaccel/torch/apps/config/train_ddp.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.torch.apps.config}/train_base.yaml

trainer:
  sync_batchnorm: true

  plugins:
    _target_: aiaccel.torch.lightning.abci_environment.ABCIEnvironment
  devices: ${oc.decode:${oc.env:OMPI_COMM_WORLD_LOCAL_SIZE}}
  num_nodes: ${oc.decode:${oc.env:OMPI_MCA_orte_num_nodes}}
--------------------------------------------------------------------------------
/tests/hpo/apps/data/single_objective/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

study:
  sampler:
    _target_: optuna.samplers.TPESampler
    seed: 0

params:
  x1: [0, 1]
  x2: [0, 1]

command: ["python", "${working_directory}/objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

n_trials: 15
n_max_jobs: 1
--------------------------------------------------------------------------------
/aiaccel/hpo/algorithms/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.hpo.algorithms.nelder_mead_algorithm import NelderMeadAlgorism, NelderMeadCoefficient, NelderMeadEmptyError

__all__ = [
    "NelderMeadCoefficient",
    "NelderMeadEmptyError",
    "NelderMeadAlgorism",
]
--------------------------------------------------------------------------------
/aiaccel/hpo/apps/config/default.yaml:
--------------------------------------------------------------------------------
db_filename: ${working_directory}/optuna.db

n_trials: 100
n_max_jobs: 10

study:
  _target_: optuna.create_study
  study_name: aiaccel-hpo
  storage:
    _target_: optuna.storages.RDBStorage
    url: sqlite:///${db_filename}

  load_if_exists: True

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
--------------------------------------------------------------------------------
/tests/hpo/apps/data/multi_objective/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

study:
  directions: ["minimize", "minimize"]
  sampler:
    _target_: optuna.samplers.NSGAIISampler
    seed: 0

params:
  x1: [0, 1]
  x2: [0, 1]

command: ["python", "${working_directory}/objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

n_trials: 15
n_max_jobs: 1

--------------------------------------------------------------------------------
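In both test configs above, `params` overrides the HparamsManager target from default.yaml with the shorthand `x1: [0, 1]`. A plausible reading, sketched below in plain Optuna, is that each two-element list becomes a float search range; this expansion rule is an assumption based on HparamsManager's role, not a confirmed API:

```python
# Hypothetical equivalent of `params: {x1: [0, 1], x2: [0, 1]}` in plain Optuna.
import optuna


def suggest_params(trial: optuna.trial.Trial) -> dict[str, float]:
    return {
        "x1": trial.suggest_float("x1", 0.0, 1.0),
        "x2": trial.suggest_float("x2", 0.0, 1.0),
    }
```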
/aiaccel/torch/lightning/callbacks/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.callbacks.load_pretrained import LoadPretrainedCallback
from aiaccel.torch.lightning.callbacks.print_unused_param import PrintUnusedParam
from aiaccel.torch.lightning.callbacks.save_metric import SaveMetricCallback

__all__ = ["SaveMetricCallback", "LoadPretrainedCallback", "PrintUnusedParam"]
--------------------------------------------------------------------------------
/examples/hpo/basic/experiment/config.yaml:
--------------------------------------------------------------------------------
_base_: ${resolve_pkg_path:aiaccel.hpo.apps.config}/default.yaml

command: ["./objective.py", "--x1={x1}", "--x2={x2}", "{out_filename}"]

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
  x1:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
  x2:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0

n_trials: 100
n_max_jobs: 50
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.lightning.abci_environment import ABCIEnvironment
from aiaccel.torch.lightning.ckpt import load_checkpoint
from aiaccel.torch.lightning.opt_lightning_module import OptimizerConfig, OptimizerLightningModule, build_param_groups

__all__ = ["ABCIEnvironment", "OptimizerConfig", "OptimizerLightningModule", "build_param_groups", "load_checkpoint"]
--------------------------------------------------------------------------------
/examples/config/basic/example.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from argparse import ArgumentParser

from hydra.utils import instantiate

from aiaccel.config import prepare_config, print_config

parser = ArgumentParser()
parser.add_argument("config", type=str, help="Config file in YAML format")
args, unk_args = parser.parse_known_args()

config = prepare_config(args.config)
print_config(config)

model = instantiate(config.model)

print(model)
--------------------------------------------------------------------------------
/aiaccel/torch/datasets/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.torch.datasets.cached_dataset import CachedDataset
from aiaccel.torch.datasets.file_cached_dataset import FileCachedDataset
from aiaccel.torch.datasets.hdf5_dataset import HDF5Dataset, RawHDF5Dataset
from aiaccel.torch.datasets.scatter_dataset import scatter_dataset

__all__ = [
    "CachedDataset",
11 | "FileCachedDataset", 12 | "RawHDF5Dataset", 13 | "HDF5Dataset", 14 | "scatter_dataset", 15 | ] 16 | -------------------------------------------------------------------------------- /examples/torch/image_classification/src/image_classification/small_resnet50.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from torch import nn 5 | 6 | from torchvision import models 7 | 8 | 9 | class SmallResNet50(nn.Sequential): 10 | def __init__(self, num_classes: int): 11 | super().__init__() 12 | 13 | self.base = models.resnet50(num_classes=num_classes) 14 | self.base.conv1 = nn.Conv2d(3, 64, 3, 1, 1, bias=False) 15 | self.base.maxpool = nn.Identity() 16 | self.base.fc = nn.Linear(2048, 10) 17 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.10 3 | cache_dir = .mypy_cache 4 | mypy_path = typings/ 5 | 6 | plugins = numpy.typing.mypy_plugin 7 | 8 | allow_redefinition = True 9 | warn_unused_configs = True 10 | warn_redundant_casts = True 11 | show_error_codes = True 12 | show_column_numbers = True 13 | check_untyped_defs = True 14 | local_partial_types = True 15 | enable_error_code = possibly-undefined 16 | warn_unused_ignores = False 17 | 18 | strict_optional = True 19 | warn_no_return = True 20 | disallow_any_unimported = True 21 | strict = True 22 | implicit_reexport = False 23 | ignore_missing_imports = True 24 | exclude = build 25 | -------------------------------------------------------------------------------- /examples/hpo/basic/objective.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 4 | # SPDX-License-Identifier: MIT 5 | 6 | import argparse 7 | from pathlib import Path 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | 12 | parser.add_argument("out_filename", type=Path) 13 | parser.add_argument("--x1", type=float) 14 | parser.add_argument("--x2", type=float) 15 | 16 | args = parser.parse_args() 17 | 18 | y = (args.x1**2) + (args.x2**2) 19 | 20 | with open(args.out_filename, "w") as f: 21 | f.write(f"{y:f}") 22 | -------------------------------------------------------------------------------- /docs/source/api_reference/config.rst: -------------------------------------------------------------------------------- 1 | ##################### 2 | OmegaConf Utilities 3 | ##################### 4 | 5 | ****************** 6 | Config Utilities 7 | ****************** 8 | 9 | .. currentmodule:: aiaccel.config 10 | 11 | .. autosummary:: 12 | :toctree: generated/ 13 | 14 | setup_omegaconf 15 | prepare_config 16 | load_config 17 | print_config 18 | resolve_inherit 19 | pathlib2str_config 20 | 21 | *************** 22 | Git Utilities 23 | *************** 24 | 25 | .. currentmodule:: aiaccel.config 26 | 27 | .. 
/docs/source/api_reference/config.rst:
--------------------------------------------------------------------------------
#####################
 OmegaConf Utilities
#####################

******************
 Config Utilities
******************

.. currentmodule:: aiaccel.config

.. autosummary::
    :toctree: generated/

    setup_omegaconf
    prepare_config
    load_config
    print_config
    resolve_inherit
    pathlib2str_config

***************
 Git Utilities
***************

.. currentmodule:: aiaccel.config

.. autosummary::
    :toctree: generated/

    collect_git_status_from_config
    print_git_status
    PackageGitStatus
--------------------------------------------------------------------------------
/docs/source/user_guide/index.md:
--------------------------------------------------------------------------------
# User Guide

## Installation
You can install aiaccel directly from PyPI:
```bash
python -m pip install aiaccel
```

## Tutorials
When you want to try the tutorials, we recommend setting up the environment with `pixi`, which installs aiaccel together with every required dependency.
First, install `pixi` by following the instructions at <https://pixi.sh/latest/installation/>.

```bash
git clone https://github.com/aistairc/aiaccel.git
cd aiaccel
pixi install
pixi shell # enter the environment interactively
```

```{toctree}
:maxdepth: 1
config
torch
hpo
```
--------------------------------------------------------------------------------
/tests/hpo/apps/data/multi_objective/objective.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("out_filename", type=str)
    parser.add_argument("--x1", type=float)
    parser.add_argument("--x2", type=float)
    args = parser.parse_args()

    y1 = (args.x1 - 2) ** 2 + (args.x2 - 1) ** 2
    y2 = args.x1 + args.x2

    with open(args.out_filename, "w") as f:
        f.write(f"{[y1, y2]}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/tests/hpo/apps/data/single_objective/objective.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("out_filename", type=str)
    parser.add_argument("--x1", type=float)
    parser.add_argument("--x2", type=float)
    args = parser.parse_args()

    y = (args.x1**2) - (4.0 * args.x1) + (args.x2**2) - args.x2 - (args.x1 * args.x2)

    with open(args.out_filename, "w") as f:
        f.write(f"{y}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
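Note that the two test objectives above write different result formats: the single-objective script writes a bare float, while the multi-objective one writes a Python-style list `[y1, y2]`. A reader that handles both could look like this sketch; how aiaccel-hpo actually parses the file is not shown in this dump, so this is an illustrative assumption:

```python
# Parse an objective's output file into a list of values, for either format.
import ast
from pathlib import Path

text = Path("objective.out").read_text()
result = ast.literal_eval(text)  # float for single-objective, [y1, y2] for multi-objective
values = result if isinstance(result, list) else [result]
```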
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/aiaccel/config/apps/get_value.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse

from omegaconf import OmegaConf as oc  # noqa: N813

from aiaccel.config.config import prepare_config


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration file path")
    parser.add_argument("key", help="Target key in the configuration file")

    args, _ = parser.parse_known_args()
    config = prepare_config(args.config)

    print(oc.select(config, args.key))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/aiaccel/config/__init__.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from aiaccel.config.config import (
    load_config,
    pathlib2str_config,
    prepare_config,
    print_config,
    resolve_inherit,
    setup_omegaconf,
)
from aiaccel.config.git import PackageGitStatus, collect_git_status_from_config, print_git_status

__all__ = [
    "prepare_config",
    "load_config",
    "pathlib2str_config",
    "print_config",
    "resolve_inherit",
    "PackageGitStatus",
    "collect_git_status_from_config",
    "print_git_status",
    "setup_omegaconf",
]
--------------------------------------------------------------------------------
/docs/source/contribution_guide/issues.md:
--------------------------------------------------------------------------------
# Issues
When you find any problems or have requests for new features, please first check that no duplicate issue has already been posted.
We usually use Japanese for internal development, but we are more than happy to communicate with you in English.

## Bug report
A bug report should briefly summarize the following details:
* What the bug is
* Steps to reproduce the bug
* What you expected to happen
* The execution environment

## Feature request

A feature request should briefly summarize the following details:

- If a bug is relevant, what it is
- What new feature you want to achieve
- A description of the implementation you have in mind
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

.PHONY: livehtml html apidoc

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
html:
	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

livehtml:
	sphinx-autobuild -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS)
--------------------------------------------------------------------------------
/docs/source/api_reference/hpo.rst:
--------------------------------------------------------------------------------
#############################
 Hyperparameter Optimization
#############################

************
 Algorithms
************

.. currentmodule:: aiaccel.hpo.algorithms

.. autosummary::
    :toctree: generated/

    NelderMeadAlgorism

******************
 Optuna Utilities
******************

Samplers
========

.. currentmodule:: aiaccel.hpo.optuna.samplers

.. autosummary::
    :toctree: generated/

    NelderMeadSampler

Hparam
======

.. currentmodule:: aiaccel.hpo.optuna.hparams

.. autosummary::
    :toctree: generated/

    Hparam
    Const
    Float
    Int
    Categorical
--------------------------------------------------------------------------------
/aiaccel/config/apps/check_git.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import argparse

from aiaccel.config.config import prepare_config
from aiaccel.config.git import collect_git_status_from_config, print_git_status


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration file path")

    args, _ = parser.parse_known_args()
    config = prepare_config(args.config)

    if len(git_status := collect_git_status_from_config(config)) > 0:
        print_git_status(git_status)

        exit(1)
    else:
        exit(0)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/tests/config/apps/test_check_git.yaml:
--------------------------------------------------------------------------------
storage:
  _target_: optuna.storages.RDBStorage
  url: sqlite:///aiaccel_storage.db
  engine_kwargs:
    connect_args:
      timeout: 30

study:
  _target_: optuna.create_study
  direction: minimize
  storage: ${storage}
  study_name: my_study
  load_if_exists: false
  sampler:
    _target_: optuna.samplers.TPESampler
    seed: 0

params:
  _convert_: partial
  _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
  x1:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
    log: false
  x2:
    _target_: aiaccel.hpo.optuna.hparams.Float
    low: 0.0
    high: 1.0
    log: false

n_trials: 30
n_max_jobs: 1
--------------------------------------------------------------------------------
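check_git.py above turns a dirty working tree into a non-zero exit code, so it can gate experiment launches; test_check_git.yaml is the fixture it is exercised against. A rough sketch of the same check done programmatically follows, assuming collect_git_status_from_config inspects the packages referenced by the config's `_target_` entries (an assumption; only the function names are confirmed by aiaccel/config/__init__.py above):

```python
# Hedged sketch of the check performed by check_git.py.
from aiaccel.config import collect_git_status_from_config, prepare_config, print_git_status

config = prepare_config("tests/config/apps/test_check_git.yaml")
if status := collect_git_status_from_config(config):
    print_git_status(status)  # uncommitted changes found: report and bail out
```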
| "y": (-10.0, 10.0), 13 | } 14 | 15 | 16 | def sphere(trial: optuna.trial.Trial) -> float: 17 | params = [] 18 | for name, distribution in search_space.items(): 19 | params.append(trial.suggest_float(name, *distribution)) 20 | 21 | return float(np.sum(np.asarray(params) ** 2)) 22 | 23 | 24 | if __name__ == "__main__": 25 | study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42)) 26 | study.optimize(func=sphere, n_trials=100) 27 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/local.yaml: -------------------------------------------------------------------------------- 1 | walltime: null 2 | 3 | script_prologue: | 4 | echo Hostname: $(hostname) 5 | 6 | export CUDA_VISIBLE_DEVICES=all 7 | 8 | cpu: 9 | job: "{command}" 10 | 11 | cpu-array: 12 | n_tasks_per_proc: null 13 | n_procs: 24 14 | job: "{command}" 15 | 16 | gpu: 17 | job: "{command}" 18 | 19 | gpu-array: 20 | n_tasks_per_proc: null 21 | n_procs: 8 22 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % {args.n_procs} )) {command}" 23 | 24 | mpi: 25 | n_nodes: null 26 | job: | 27 | mpirun -np {args.n_procs} \\ 28 | {command} 29 | 30 | train: 31 | job: | 32 | mpirun -np {args.n_gpus} \\ 33 | -x MAIN_ADDR=$(hostname -i) \\ 34 | -x MAIN_PORT=3000 \\ 35 | -x COLUMNS=120 \\ 36 | -x PYTHONUNBUFFERED=true \\ 37 | {command} 38 | -------------------------------------------------------------------------------- /examples/hpo/nelder_mead/README.md: -------------------------------------------------------------------------------- 1 | # Examples of NelderMeadSampler 2 | 3 | ## 1. File Structure 4 | 5 | ### example.py 6 | ### example_parallel.py 7 | ### example_enqueue.py 8 | ### example_sub_sampler.py 9 | 10 | - This code demonstrates the general usage of NelderMeadSampler. 11 | - For more information, please refer to the following documents : docs/source/user_guide/hpo 12 | 13 | ### coco 14 | 15 | - This directory contains code for verifying NelderMeadSampler using the black-box optimization evaluation framework coco. 16 | - For details, please refer to the README.md in the relevant directory. 17 | 18 | ## 2. Instructions 19 | 20 | - After installing aiaccel and activating the virtual environment, run the corresponding file. 21 | 22 | ```bash 23 | python example.py 24 | ``` 25 | 26 | ## 3. Checking Results 27 | 28 | - The execution results of the example code are displayed in the standard output. 29 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_parallel.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import time

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    time.sleep(0.01)

    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42, block=True))
    study.optimize(func=sphere, n_trials=100, n_jobs=3)
--------------------------------------------------------------------------------
/tests/torch/datasets/test_cached_dataset.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from collections import defaultdict

from torch.utils.data import Dataset

from aiaccel.torch.datasets.cached_dataset import CachedDataset


def test_cached_dataset() -> None:
    class DummyDataset(Dataset[int]):
        def __init__(self) -> None:
            self.counter = defaultdict[int, int](lambda: 0)

        def __getitem__(self, index: int) -> int:
            self.counter[index] += 1
            return index

    orig_dataset = DummyDataset()
    dataset = CachedDataset(orig_dataset)

    for _ in range(2):
        for ii in range(5):
            assert dataset[ii] == ii

    assert all(count == 1 for count in orig_dataset.counter.values())
--------------------------------------------------------------------------------
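As the test above shows, CachedDataset memoizes `__getitem__` so each underlying index is materialized only once. A short usage sketch along the same lines (the wrapped dataset is a stand-in):

```python
# Wrap a dataset with expensive item construction in CachedDataset.
from torch.utils.data import Dataset

from aiaccel.torch.datasets import CachedDataset


class ExpensiveDataset(Dataset[int]):
    def __len__(self) -> int:
        return 5

    def __getitem__(self, index: int) -> int:
        # imagine costly decoding / preprocessing here
        return index * index


dataset = CachedDataset(ExpensiveDataset())
print([dataset[i] for i in range(5)])  # first pass fills the cache
print([dataset[i] for i in range(5)])  # second pass is served from the cache
```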
/aiaccel/torch/lightning/callbacks/save_metric.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import json

import lightning


class SaveMetricCallback(lightning.Callback):
    """
    Lightning callback that saves a metric value when fitting ends.

    Args:
        metric_name (str): Name of the metric to save.
        output_path (str): Path of the file to save the metric to.
    """

    def __init__(self, metric_name: str, output_path: str) -> None:
        super().__init__()
        self.metric_name = metric_name
        self.output_path = output_path

    def on_fit_end(self, trainer: lightning.Trainer, pl_module: lightning.LightningModule) -> None:
        metric_value = trainer.callback_metrics[self.metric_name].item()
        with open(self.output_path, "w") as f:
            json.dump(metric_value, f)
--------------------------------------------------------------------------------
/tests/hpo/optuna/samplers/results_ackley_step.csv:
--------------------------------------------------------------------------------
x,y,objective
-8,27.0,19.62720599480816
12,5.5,18.625976717821782
-20,-21.0,19.668950397165975
-8,-2.5,15.605357241663704
12,-23.5,21.23920263185265
-2,14.0,17.293294335267746
-26,6.0,19.54061203177721
4,6.0,12.786686824116272
-2,-11.0,15.88518677832371
-2,-4.5,11.750950682245877
10,3.5,17.248230608441084
-4,-1.0,8.836638915350669
-10,-11.0,17.556692462851238
0,1.5,5.5411239587646826
-2,5.0,10.661412934927588
-2,3.0,7.9889108105187
2,5.0,10.661412934927588
-2,0.5,6.776152740106655
0,-0.5,3.0836533599911533
0,-2.0,4.927233671124704
2,0.5,6.776152740106655
0,0.5,3.0836533599911533
-2,-1.5,7.674511801927853
0,1.0,2.637531092108304
0,-0.5,3.0836533599911533
0,0.5,3.0836533599911533
0,0.0,0.0
0,0.5,3.0836533599911533
0,0.0,0.0
0,-0.5,3.0836533599911533
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_sub_sampler.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import time

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    time.sleep(0.01)

    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(
        sampler=NelderMeadSampler(search_space=search_space, seed=42, sub_sampler=optuna.samplers.TPESampler(seed=42))
    )
    study.optimize(func=sphere, n_trials=100, n_jobs=3)
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/example_enqueue.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import numpy as np

import optuna

from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadSampler

search_space = {
    "x": (-10.0, 10.0),
    "y": (-10.0, 10.0),
}


def sphere(trial: optuna.trial.Trial) -> float:
    params = []
    for name, distribution in search_space.items():
        params.append(trial.suggest_float(name, *distribution))

    return float(np.sum(np.asarray(params) ** 2))


if __name__ == "__main__":
    study = optuna.create_study(sampler=NelderMeadSampler(search_space=search_space, seed=42))
    study.enqueue_trial({"x": 1.0, "y": 1.0})
    study.enqueue_trial({"x": 1.0, "y": 2.0})
    study.enqueue_trial({"x": 2.0, "y": 1.0})
    study.optimize(func=sphere, n_trials=100)
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/callbacks/print_unused_param.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

import lightning as lt
from lightning.pytorch.utilities import rank_zero_warn


class PrintUnusedParam(lt.Callback):
    """Warn once when trainable parameters do not receive gradients."""

    def __init__(self) -> None:
        super().__init__()
        self._has_warned = False

    def on_after_backward(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None:  # type: ignore[override]
        """Emit a warning for parameters that never collected gradients."""
        if self._has_warned or not trainer.is_global_zero:
            return

        for name, param in pl_module.named_parameters():
            if param.requires_grad and param.grad is None:
                rank_zero_warn(f"{name} is unused")

        self._has_warned = True
--------------------------------------------------------------------------------
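PrintUnusedParam only reports once, and only on the global-zero rank, which keeps DDP logs readable. Attaching it is a one-liner; the rest of the Trainer setup below is an illustrative assumption:

```python
# Register the callback on a Lightning Trainer.
import lightning as lt

from aiaccel.torch.lightning.callbacks import PrintUnusedParam

trainer = lt.Trainer(max_epochs=1, callbacks=[PrintUnusedParam()])
# trainer.fit(model, datamodule=datamodule)  # any parameter left without a gradient is warned about once
```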
/tests/torch/datasets/test_scatter_dataset.py:
--------------------------------------------------------------------------------
# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from torch.utils.data import Dataset

from pytest_mock.plugin import MockerFixture

from aiaccel.torch.datasets.scatter_dataset import scatter_dataset


def test_scatter_dataset(mocker: MockerFixture) -> None:
    class DummyDataset(Dataset[int]):
        def __len__(self) -> int:
            return 16

        def __getitem__(self, index: int) -> int:
            return index

    orig_dataset = DummyDataset()

    mocker.patch("torch.distributed.get_world_size", return_value=4)
    indices = []
    for rank in range(4):
        mocker.patch("torch.distributed.get_rank", return_value=rank)
        dataset = scatter_dataset(orig_dataset)

        assert len(dataset) == len(orig_dataset) // 4
        indices += list(dataset.indices)

    assert sorted(indices) == list(range(16))
--------------------------------------------------------------------------------
/examples/torch/image_classification/README.md:
--------------------------------------------------------------------------------
# Training a ResNet50 on CIFAR-10

## Setup
We assume the Python-environment setup at `examples/python`.
```bash
pushd ../../python/
bash setup_python.sh
. activate.sh
popd
```

```bash
pip install -e .
```

## Training on a single GPU
```bash
qsub -I -P [group_name] -q rt_HG -l select=1 -l walltime=1:0:0

cd $PBS_O_WORKDIR
. ../../python/activate.sh

cd recipes
aiaccel-torch train resnet50.cifar10/config.yaml
```

## Training on multiple GPUs
This script automatically uses all the GPUs on the node. The hyperparameters assume eight GPUs.
```bash
qsub -I -P [group_name] -q rt_HF -l select=1 -l walltime=1:0:0

cd $PBS_O_WORKDIR
. ../../python/activate.sh

cd recipes
aiaccel-torch train resnet50.cifar10.ddp/config.yaml
```

## Detailed Descriptions [TBD]
Detailed descriptions are available on the [aiaccel document](https://aistairc.github.io/aiaccel/user_guide/torch.html)
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
name: Lint

on:
  push:
    branches: ["main", "develop/*"]
  pull_request:

jobs:
  lint:
    name: Lint
    runs-on: ${{ matrix.os }}
    env:
      PIP_INDEX_URL: https://download.pytorch.org/whl/cpu
      PIP_EXTRA_INDEX_URL: https://pypi.org/simple
    strategy:
      matrix:
        os: ['ubuntu-22.04']
        python-version: ['3.10']
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: pyproject.toml
      - name: Install dependencies
        run: |
          pip install .[dev,github-actions]
      - name: Perform ruff
        run: |
          ruff check
          ruff format --check
      - name: Perform mypy
        run: |
          mypy --config-file mypy.ini . --explicit-package-bases
      - name: Perform docstrfmt
        run: |
          docstrfmt --check docs/source/
--------------------------------------------------------------------------------
/docs/source/contribution_guide/documentation.md:
--------------------------------------------------------------------------------
(documentation-wip)=
# Documentation (WIP)

## Docstrings

- Write a basic description of the implemented functions, the types and meanings of parameters and return values, and examples of their usage.
- Write in accordance with the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
- See also [Coding Conventions](coding_styles).

## Documentation

- Create source files for documentation in a directory under docs.
- The recommended file format for documents is Markdown.
- Create documentation for any major feature additions.

## Confirming rendering

If you have added, changed, or modified documents, make sure that they render correctly in the local environment.
Move to the aiaccel directory and execute the following command to generate an API reference.

~~~bash
cd aiaccel
sphinx-apidoc --maxdepth 2 -f -o ./docs/source/api_reference/ ./aiaccel/
~~~

Move to aiaccel/docs and build html files to see how the document is rendered.

~~~bash
cd docs
make html
~~~
--------------------------------------------------------------------------------
/aiaccel/launcher.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3

# Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
# SPDX-License-Identifier: MIT

from argparse import ArgumentParser
import importlib
from pathlib import Path
import pkgutil
import sys


def main() -> None:
    target_module = Path(sys.argv[0]).stem.split("-")[-1]

    package = importlib.import_module(f"aiaccel.{target_module}.apps")

    modules = [name.replace("_", "-") for _, name, ispkg in pkgutil.iter_modules(package.__path__) if not ispkg]
    if not modules:
        raise RuntimeError(f"No apps found in aiaccel.{target_module}.apps")

    parser = ArgumentParser(description=f"Run aiaccel-{target_module} apps.", add_help=False)
    parser.add_argument("command", choices=modules, help="The command to run.")
    args, unk_args = parser.parse_known_args()

    module = importlib.import_module(f"aiaccel.{target_module}.apps.{args.command.replace('-', '_')}")

    sys.argv = [str(module.__file__)] + unk_args
    module.main()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
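launcher.py implements the dispatch behind console scripts such as `aiaccel-torch`: the suffix of argv[0] selects the package (`aiaccel.torch.apps`), the first CLI argument selects the app module, and the remaining arguments are forwarded. The snippet below is the programmatic equivalent of `aiaccel-torch train config.yaml`, following the same steps launcher.py performs (the config path is illustrative):

```python
# Manual replay of launcher.py's dispatch for `aiaccel-torch train ...`.
import importlib
import sys

module = importlib.import_module("aiaccel.torch.apps.train")  # "train" comes from the command argument
sys.argv = [str(module.__file__), "resnet50.cifar10/config.yaml"]  # forwarded arguments
module.main()
```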
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/typings/h5py.pyi:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | 
3 | from collections.abc import KeysView
4 | from pathlib import Path
5 | from types import TracebackType
6 | 
7 | # This stub is just for passing mypy.
8 | class Dataset:
9 |     def __setitem__(self, arg: Any, value: Any) -> None: ...
10 |     def __getitem__(self, key: Any) -> Any: ...
11 | 
12 | class Group:
13 |     def create_dataset(self, name: str, shape: Any | None = None, dtype: Any | None = None) -> Dataset: ...
14 |     def __getitem__(self, name: Any) -> Dataset: ...
15 |     def items(self) -> list[tuple[str, Any]]: ...
16 | 
17 | class File:
18 |     def __init__(
19 |         self,
20 |         name: str | Path,
21 |         mode: str = "r",
22 |         driver: str | None = None,
23 |         comm: Any | None = None,
24 |     ) -> None: ...
25 |     def __enter__(self) -> File: ...
26 |     def __exit__(
27 |         self,
28 |         ex_exc_type: type[BaseException] | None,
29 |         exc_value: BaseException | None,
30 |         traceback: TracebackType | None,
31 |     ) -> bool: ...
32 |     def keys(self) -> KeysView[str]: ...
33 |     def __getitem__(self, key: str) -> Group | Dataset: ...
34 |     def close(self) -> None: ...
35 |     def create_group(self, name: str) -> Group: ...
36 | 
--------------------------------------------------------------------------------
/examples/hpo/nelder_mead/README_ja.md:
--------------------------------------------------------------------------------
1 | # NelderMeadSampler examples
2 | 
3 | ## 1. File Structure
4 | 
5 | ### example.py
6 | 
7 | - Code showing the typical usage of NelderMeadSampler (see the sketch at the end of this section).
8 | - The optimization target is the benchmark function sphere. (The same applies to the examples below unless otherwise noted.)
9 | 
10 | ### example_parallel.py
11 | 
12 | - Code showing how to use NelderMeadSampler with parallel execution.
13 | - Parallel execution is enabled by passing block=True to NelderMeadSampler and n_jobs=3 to study.optimize.
14 | - Enabling parallel execution parallelizes the initial-point evaluations and the evaluations during shrink, which is faster than serial execution.
15 | 
16 | ### example_enqueue.py
17 | 
18 | - Code showing how to use NelderMeadSampler with optuna.study.enqueue_trial.
19 | - Using the ask-tell interface, random parameters are searched via enqueue_trial when NelderMeadSampler fails to produce parameters.
20 | 
21 | ### example_sub_sampler.py
22 | 
23 | - Code showing how to use the sub_sampler feature of NelderMeadSampler.
24 | - With sub_sampler=optuna.samplers.TPESampler passed to NelderMeadSampler, the TPESampler is used for the search when NelderMeadSampler fails to produce parameters.
25 | - When using the sub_sampler feature, the argument block=False is required even for parallel execution. (Parallel execution is still possible with block=False.)
26 | 
27 | ### coco
28 | 
29 | - A directory containing verification code for NelderMeadSampler based on coco, a framework for evaluating black-box optimization.
30 | - See the README.md in that directory for details.
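
The following is a minimal sketch of the usage illustrated by `example.py`. The import path follows this repository's package layout, but the exact `NelderMeadSampler` constructor arguments (e.g., `search_space`) are assumptions, not confirmed by this README:

```python
import optuna

# Assumed import path, based on the aiaccel.hpo.optuna.samplers package in this repository.
from aiaccel.hpo.optuna.samplers import NelderMeadSampler


def sphere(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -10.0, 10.0)
    y = trial.suggest_float("y", -10.0, 10.0)
    return x**2 + y**2


# search_space is assumed to map each parameter name to its (low, high) range.
sampler = NelderMeadSampler(search_space={"x": (-10.0, 10.0), "y": (-10.0, 10.0)}, seed=42)
study = optuna.create_study(sampler=sampler)
study.optimize(sphere, n_trials=30)
```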
31 | 
32 | ## 2. How to Run
33 | 
34 | - After installing aiaccel and activating the virtual environment, run the target file.
35 | 
36 | ```bash
37 | python example.py
38 | ```
39 | 
40 | ## 3. Checking the Results
41 | 
42 | - The results of running the example code are printed to standard output.
43 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_install_hook_types:
2 |   - pre-commit
3 |   - pre-push
4 | 
5 | repos:
6 |   - repo: https://github.com/astral-sh/ruff-pre-commit
7 |     rev: v0.14.8
8 |     hooks:
9 |       - id: ruff
10 |         args: [--fix]
11 |       - id: ruff-format
12 |   - repo: https://github.com/pre-commit/mirrors-mypy
13 |     rev: v1.19.0
14 |     hooks:
15 |       - id: mypy
16 |         language: system
17 |         args: [--config-file, mypy.ini, --explicit-package-bases]
18 |   - repo: https://github.com/LilSpazJoekp/docstrfmt
19 |     rev: v2.0.0
20 |     hooks:
21 |       - id: docstrfmt
22 |         args: [--check, docs/source/, --extend-exclude, docs/source/api_reference/generated]
23 |         types_or: [rst]
24 |   - repo: https://github.com/Lucas-C/pre-commit-hooks
25 |     rev: v1.5.5
26 |     hooks:
27 |       - id: insert-license
28 |         name: "Insert license header"
29 |         args:
30 |           - --license-filepath=LICENSE_HEADER
31 |           - --detect-license-in-X-top-lines=16
32 |         types_or: [python, makefile]
33 | 
34 |   - repo: local
35 |     hooks:
36 |       - id: pytest
37 |         name: pytest
38 |         entry: pytest -v -x -n auto --cov=aiaccel/ --cov-append tests/
39 |         stages: [pre-push]
40 |         language: system
41 |         pass_filenames: false
42 |         always_run: true
--------------------------------------------------------------------------------
/tests/torch/lightning/test_abci_environment.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import os
5 | from unittest import mock
6 | 
7 | import pytest
8 | 
9 | from aiaccel.torch.lightning.abci_environment import ABCIEnvironment
10 | 
11 | 
12 | @mock.patch.dict(
13 |     os.environ,
14 |     {
15 |         "OMPI_COMM_WORLD_SIZE": "8",
16 |         "OMPI_COMM_WORLD_RANK": "6",
17 |         "OMPI_COMM_WORLD_LOCAL_RANK": "2",
18 |         "OMPI_COMM_WORLD_LOCAL_SIZE": "4",
19 |         "MAIN_ADDR": "192.168.0.1",
20 |         "MAIN_PORT": "3000",
21 |     },
22 | )
23 | def test_abci_environment() -> None:
24 |     env = ABCIEnvironment()
25 | 
26 |     assert env.creates_processes_externally
27 | 
28 |     assert env.main_address == "192.168.0.1"
29 |     assert env.main_port == 3000
30 | 
31 |     assert env.detect()
32 |     assert env.world_size() == 8
33 |     assert env.global_rank() == 6
34 |     assert env.node_rank() == 1
35 |     assert env.local_rank() == 2
36 | 
37 |     env.validate_settings(4, 2)
38 | 
39 |     with pytest.raises(ValueError, match=r"^`num_devices` should match.*"):
40 |         env.validate_settings(3, 2)
41 | 
42 |     with pytest.raises(ValueError, match=r"^`num_devices \* num_nodes` should match.*"):
43 |         env.validate_settings(4, 1)
44 | 
--------------------------------------------------------------------------------
/aiaccel/job/apps/config/slurm.yaml:
--------------------------------------------------------------------------------
1 | walltime: "1:0:0"
2 | 
3 | script_prologue: |
4 |   echo Job ID: $SLURM_JOBID
5 |   echo Hostname: $(hostname)
6 | 
7 | sbatch: "sbatch --export=USE_SSH=1 --export=ALL"
8 | 
9 | cpu:
10 |   sbatch_args: "-p cpu1 -N 1"
11 |   job: "{command}"
12 | 
13 | cpu-array:
14 |   n_tasks_per_proc: 64
15 |   n_procs: 4
16 |   sbatch_args: "-p cpu1 -N 1 --array=1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
17 |   job: 
"{command}" 18 | 19 | gpu: 20 | sbatch_args: "-p gpu1 -N 1" 21 | job: "{command}" 22 | 23 | gpu-array: 24 | n_tasks_per_proc: 64 25 | n_procs: 4 26 | sbatch_args: "-p gpu1 -N 1 --array=1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 27 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 28 | 29 | mpi: 30 | n_nodes: 1 31 | sbatch_args: >- 32 | -p gpu1 -N {args.n_nodes} -n {args.n_procs} 33 | job: srun -n {args.n_procs} --cpu-bind=none --distribution=block:block {command} 34 | 35 | train: 36 | sbatch_args: >- 37 | -p gpu1 -N {args.n_gpus} 38 | job: | 39 | export MAIN_ADDR=$(hostname -i) 40 | export MAIN_PORT=3000 41 | export COLUMNS=120 42 | export PYTHONUNBUFFERED=true 43 | srun -n {args.n_gpus} --cpu-bind=none --distribution=block:block {command} 44 | 45 | use_scandir: False 46 | -------------------------------------------------------------------------------- /tests/config/apps/test_check_git.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | 6 | from pytest_mock import MockerFixture 7 | 8 | from aiaccel.config.apps import check_git 9 | from aiaccel.config.git import PackageGitStatus 10 | 11 | 12 | def test_check_git(mocker: MockerFixture) -> None: 13 | mock_args = mocker.Mock() 14 | mock_args.config = str(Path(__file__).parent / "test_check_git.yaml") 15 | 16 | mock_argparse = mocker.patch("argparse.ArgumentParser.parse_known_args") 17 | mock_argparse.return_value = (mock_args, []) 18 | 19 | # Success 20 | mock_func = mocker.patch("aiaccel.config.apps.check_git.collect_git_status_from_config") 21 | mock_func.return_value = [] 22 | 23 | try: 24 | check_git.main() 25 | except SystemExit as e: 26 | if e.code != 0: 27 | raise AssertionError() from e 28 | 29 | # Failed 30 | mock_func = mocker.patch("aiaccel.config.apps.check_git.collect_git_status_from_config") 31 | mock_func.return_value = [PackageGitStatus("test_package", "test_id", [])] 32 | 33 | try: 34 | check_git.main() 35 | except SystemExit as e: 36 | if e.code != 1: 37 | raise AssertionError() from e 38 | else: 39 | raise AssertionError() 40 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley_int.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | -2,9.014286128198322,14.586422044008504 3 | 4,1.973169683940732,9.375537945818003 4 | -6,-6.880109593275947,14.844107008867795 5 | -2,-0.6931908436032099,6.507841622369519 6 | 4,-7.7343072878607995,15.311178036343385 7 | 0,4.827137774183541,10.531632891221614 8 | 2,-3.5471589338460197,10.452142477225951 9 | 1,-1.4535847568386302,6.11338663731501 10 | -5,-4.119945284382571,12.343741882384256 11 | 2,0.4498909418599073,6.726787205494825 12 | -3,-2.5966665423017465,10.213321495706827 13 | 0,-0.31174842918050594,2.2163113158354655 14 | 5,-1.0721423424159262,10.429712991161596 15 | 0,-0.7879287183063894,2.97192765167404 16 | -1,0.3539076093517366,4.287560101074227 17 | 0,-0.09796548219585688,0.5135246948216725 18 | 0,0.37821480693002485,2.6103706703067395 19 | 0,0.08667892562092305,0.43335945922963504 20 | -1,0.3004618726055739,4.053178155393216 21 | 0,-0.15869585373398643,0.999428637748025 22 | 0,0.14740929715905438,0.9043866409467292 23 | 0,0.07088300943579284,0.328887506768492 24 | 0,0.2555274172525728,1.8077753439839057 25 | 
0,-0.009592257333748577,0.029579255457047537
26 | -1,-0.025388173518878787,2.655523461532038
27 | 0,0.05866215083597304,0.25500217190066365
28 | 0,-0.021813115933571936,0.07431720082603732
29 | 0,-0.09006752410329355,0.4569753311471949
30 | 0,0.021479732101155946,0.07299288494041889
31 | 0,0.03370059070097575,0.12528010757207042
32 | 
--------------------------------------------------------------------------------
/aiaccel/torch/lr_schedulers/sequential_lr.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from collections.abc import Callable
5 | 
6 | import torch
7 | 
8 | 
9 | class SequentialLR(torch.optim.lr_scheduler.SequentialLR):
10 |     """
11 |     A wrapper of torch.optim.lr_scheduler.SequentialLR to use a list of functions
12 |     to create schedulers.
13 | 
14 |     Args:
15 |         optimizer: Optimizer.
16 |         schedulers_fn: List of functions to create schedulers.
17 |         milestones: List of epoch indices. Must be increasing.
18 | 
19 |     .. code-block:: yaml
20 | 
21 |         scheduler_generator:
22 |           _partial_: True
23 |           _convert_: "all"
24 |           _target_: aiaccel.torch.lr_schedulers.SequentialLR
25 |           schedulers_fn:
26 |             - _target_: torch.optim.lr_scheduler.LinearLR
27 |               _partial_: True
28 |               start_factor: 1.e-3
29 |               end_factor: 1.0
30 |               total_iters: 5000
31 |             - _target_: torch.optim.lr_scheduler.CosineAnnealingLR
32 |               _partial_: True
33 |               T_max: 95000
34 |           milestones: [5000]
35 |     """
36 | 
37 |     def __init__(
38 |         self,
39 |         optimizer: torch.optim.Optimizer,
40 |         schedulers_fn: list[Callable[[torch.optim.Optimizer], torch.optim.lr_scheduler._LRScheduler]],
41 |         milestones: list[int],
42 |     ):
43 |         super().__init__(optimizer, [fn(optimizer) for fn in schedulers_fn], milestones)
44 | 
--------------------------------------------------------------------------------
/examples/hpo/benchmark/main_parallel_coco.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from concurrent.futures import ThreadPoolExecutor
5 | from itertools import product
6 | import subprocess
7 | 
8 | 
9 | def main() -> None:
10 |     sampler_names = ["nelder-mead", "nelder-mead-subTPE", "TPE"]
11 |     func_ids = list(range(1, 25))
12 |     dims = [2, 3, 5, 10, 20, 40]
13 |     execute_times = ["0:01:00", "0:02:00", "0:03:00", "0:10:00", "0:30:00", "3:00:00"]
14 |     instances = list(range(1, 16))
15 |     optuna_seeds = list(range(1, 16))
16 | 
17 |     combinations = product(
18 |         sampler_names, func_ids, zip(dims, execute_times, strict=False), zip(instances, optuna_seeds, strict=False)
19 |     )
20 | 
21 |     with ThreadPoolExecutor() as pool:
22 |         for sampler_name, func_id, (dim, execute_time), (instance, optuna_seed) in combinations:
23 |             execute_time = "0:05:00" if sampler_name == "nelder-mead" else execute_time
24 |             print(sampler_name, (func_id, execute_time), dim, (instance, optuna_seed))
25 | 
26 |             aiaccel_job_command = f"""\
27 | aiaccel-job pbs --config job_config.yaml cpu --walltime {execute_time} log/job_{func_id}_{dim}_{instance}.log \
28 |     -- python3.13 experiment_coco.py --func_id {func_id} --dim {dim} \
29 |     --instance {instance} --optuna_seed {optuna_seed} --sampler_name {sampler_name}
30 | """
31 | 
32 |             pool.submit(subprocess.run, aiaccel_job_command, shell=True)
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
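# For reference, one fully expanded submission command produced by the f-string
# above (for sampler_name="TPE", func_id=1, dim=2, instance=1, optuna_seed=1,
# where dim=2 maps to execute_time="0:01:00") would look like:
#
#   aiaccel-job pbs --config job_config.yaml cpu --walltime 0:01:00 log/job_1_2_1.log \
#       -- python3.13 experiment_coco.py --func_id 1 --dim 2 --instance 1 --optuna_seed 1 --sampler_name TPE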
-------------------------------------------------------------------------------- /tests/hpo/optuna/test_hparams.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import optuna 5 | 6 | from aiaccel.hpo.optuna.hparams import ( 7 | Categorical, 8 | Const, 9 | Float, 10 | Int, 11 | ) 12 | 13 | 14 | def test_const() -> None: 15 | const = Const(value=0.5) 16 | assert const(trial=None, name="x1") == 0.5 17 | 18 | 19 | def test_float() -> None: 20 | suggest_float = Float(low=0.0, high=1.0, step=None, log=False) 21 | trial = optuna.create_study().ask() 22 | 23 | assert isinstance(suggest_float(trial=trial, name="x2"), float) 24 | 25 | 26 | def test_int() -> None: 27 | suggest_int = Int(low=0, high=10, step=1, log=False) 28 | trial = optuna.create_study().ask() 29 | 30 | assert isinstance(suggest_int(trial=trial, name="x3"), int) 31 | 32 | 33 | def test_categorical() -> None: 34 | suggest_categorical = Categorical(choices=[0, 1, 2]) 35 | trial = optuna.create_study().ask() 36 | 37 | assert suggest_categorical(trial=trial, name="x4") in [0, 1, 2] 38 | 39 | 40 | def test_discrete_uniform() -> None: 41 | suggest_discrete_uniform = Float(low=0.0, high=1.0, step=0.1) 42 | trial = optuna.create_study().ask() 43 | 44 | assert isinstance(suggest_discrete_uniform(trial=trial, name="x5"), float) 45 | 46 | 47 | def test_log_uniform() -> None: 48 | suggest_log_uniform = Float(low=0.1, high=1.0, log=True) 49 | trial = optuna.create_study().ask() 50 | 51 | assert isinstance(suggest_log_uniform(trial=trial, name="x6"), float) 52 | -------------------------------------------------------------------------------- /aiaccel/hpo/optuna/hparams.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Generic, TypeVar 5 | 6 | from collections.abc import Sequence 7 | from dataclasses import dataclass 8 | 9 | from optuna.trial import Trial 10 | 11 | T = TypeVar("T") 12 | 13 | 14 | @dataclass 15 | class Hparam(Generic[T]): 16 | def __call__(self, trial: Trial, name: str) -> T: 17 | raise NotImplementedError 18 | 19 | 20 | @dataclass 21 | class Const(Hparam[T]): 22 | value: T 23 | 24 | def __call__(self, trial: Trial | None, name: str | None) -> T: 25 | return self.value 26 | 27 | 28 | @dataclass 29 | class Float(Hparam[float]): 30 | low: float 31 | high: float 32 | step: float | None = None 33 | log: bool = False 34 | 35 | def __call__(self, trial: Trial, name: str) -> float: 36 | return trial.suggest_float(name=name, low=self.low, high=self.high, step=self.step, log=self.log) 37 | 38 | 39 | @dataclass 40 | class Int(Hparam[int]): 41 | low: int 42 | high: int 43 | step: int = 1 44 | log: bool = False 45 | 46 | def __call__(self, trial: Trial, name: str) -> int: 47 | return trial.suggest_int(name=name, low=self.low, high=self.high, step=self.step, log=self.log) 48 | 49 | 50 | @dataclass 51 | class Categorical(Hparam[None | bool | int | float | str]): 52 | choices: Sequence[None | bool | int | float | str] 53 | 54 | def __call__(self, trial: Trial, name: str) -> None | bool | int | float | str: 55 | return trial.suggest_categorical(name=name, choices=self.choices) 56 | -------------------------------------------------------------------------------- 
/examples/hpo/benchmark/README_ja.md:
--------------------------------------------------------------------------------
1 | # Verification Code for NelderMeadSampler Using coco
2 | 
3 | ## 1. File Structure
4 | 
5 | ### nelder-mead
6 | ### nelder-mead-subTPE
7 | ### TPE
8 | 
9 | - Directories that store the CSV files of the optimization results for each sampler.
10 | 
11 | ### experiment_coco.py
12 | 
13 | - The main code that runs the verification using coco.
14 | - It is designed to run for (number of dimensions * 20) steps with 10 parallel workers.
15 | - When executed, the Optuna results are output to `optuna_csv`, and the per-parallel-step results are output to `step_csv`.
16 | 
17 | ### main_parallel_coco.py
18 | 
19 | - Code that uses `job_dispatcher` to submit a job for each sampler, function, and dimension.
20 | 
21 | ### objective.sh
22 | 
23 | - A qsub script submitted by `job_dispatcher`.
24 | 
25 | ### plot.py
26 | 
27 | - Code that plots the results of each sampler using matplotlib.
28 | - It reads the `optuna_csv` files in each sampler's directory.
29 | 
30 | ### result_bbob_dim_vs_value-fopt_parallel.png
31 | 
32 | - A graph image visualizing the verification results produced by running `plot.py`.
33 | - For each of the 24 benchmark functions, it shows a plot with the number of dimensions on the horizontal axis and the mean and deviation of the optimization results on the vertical axis.
34 | 
35 | ## 2. How to Run
36 | 
37 | - Install aiaccel and activate the virtual environment.
38 | 
39 | - Install coco.
40 |   - See the following repository for details:
41 |     https://github.com/numbbo/coco
42 | 
43 | - Rewrite the virtual environment in `objective.sh` and the job_group in `main_parallel_coco.py` to the appropriate path and ID.
44 | - Running `main_parallel_coco.py` executes the verification for each sampler.
45 | - The results are saved in `optuna_csv` and `step_csv` directly under each directory.
46 | 
47 | ```bash
48 | cd nelder-mead
49 | python main_parallel_coco.py
50 | ```
51 | 
52 | - Running `plot.py` requires pandas and matplotlib.
53 | 
54 | ```bash
55 | pip install pandas matplotlib
56 | python plot.py
57 | ```
58 | 
59 | ## 3. Checking the Results
60 | 
61 | - The verification results for each sampler are output to `optuna_csv` and `step_csv` under the directory corresponding to that sampler.
62 | - The visualization produced by `plot.py` is output to `result_bbob_dim_vs_value-fopt_parallel.png`.
63 | - The visualized results show that nelder-mead-subTPE tends to produce better results in parallel execution, although nelder-mead produces better results for some functions.
-------------------------------------------------------------------------------- /tests/job/apps/local.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | import subprocess 6 | 7 | import pytest 8 | 9 | cmd = ["aiaccel-job", "local"] 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "base_args", 14 | [ 15 | ["cpu"], 16 | ["cpu", "--n_tasks=10"], 17 | ["gpu"], 18 | ["gpu", "--n_tasks=10"], 19 | ], 20 | ) 21 | def test_default(base_args: list[str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 22 | monkeypatch.chdir(tmp_path) 23 | log_path = tmp_path / "test.log" 24 | 25 | subprocess.run(cmd + base_args + [log_path, "--", "sleep", "0"], check=True) 26 | 27 | 28 | def test_config_from_argparse(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 29 | monkeypatch.chdir(tmp_path) 30 | log_path = tmp_path / "test.log" 31 | 32 | config_path = Path(__file__).parent / "config" / "custom_local.yaml" 33 | 34 | subprocess.run(cmd + ["--config", config_path, "cpu", log_path, "sleep", "0"], check=True) 35 | 36 | with open(tmp_path / "config_path.txt") as f: 37 | assert Path(f.read().rstrip("\n")) == Path(config_path) 38 | 39 | 40 | def test_config_from_environ(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: 41 | monkeypatch.chdir(tmp_path) 42 | log_path = tmp_path / "test.log" 43 | 44 | config_path = Path(__file__).parent / "config" / "custom_local.yaml" 45 | monkeypatch.setenv("AIACCEL_JOB_CONFIG", str(config_path)) 46 | 47 | subprocess.run(cmd + ["cpu", log_path, "--", "sleep", "0"], check=True) 48 | 49 | with open(tmp_path / "config_path.txt") as f: 50 | assert Path(f.read().rstrip("\n")) == config_path 51 | -------------------------------------------------------------------------------- /aiaccel/torch/functional/linear_sum_assignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import numpy as np 5 | from scipy.optimize import linear_sum_assignment as scipy_linear_sum_assignment 6 | 7 | import torch 8 | 9 | 10 | def linear_sum_assignment(cost_matrix: torch.Tensor, maximize: bool = False) -> tuple[torch.Tensor, torch.Tensor]: 11 | """ 12 | Solve the linear sum assignment problem for a batch of cost matrices. 13 | 14 | Args: 15 | cost_matrix (torch.Tensor): A tensor of shape (..., m, n) 16 | representing the cost matrix for each assignment problem. 17 | maximize (bool): If True, the problem is treated as a maximization problem. 18 | If False, it is treated as a minimization problem. Defaults to False. 19 | Returns: 20 | tuple: A tuple containing two tensors: 21 | - row_indices: Indices of the rows assigned to each column. 22 | - col_indices: Indices of the columns assigned to each row. 
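
    Example:
        A batched sketch (shapes only; the values are illustrative)::

            cost = torch.rand(8, 5, 5)  # a batch of 8 cost matrices
            rows, cols = linear_sum_assignment(cost)
            assert rows.shape == (8, 5) and cols.shape == (8, 5)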
23 | """ 24 | 25 | assert cost_matrix.ndim >= 2, "cost_matrix must have at least 2 dimensions" 26 | 27 | *batch_shape, m, n = cost_matrix.shape 28 | 29 | row_ind_list, col_ind_list = [], [] 30 | for cm in cost_matrix.reshape(-1, m, n).cpu().numpy(): 31 | row_ind, col_ind = scipy_linear_sum_assignment(cm, maximize=maximize) 32 | 33 | row_ind_list.append(row_ind) 34 | col_ind_list.append(col_ind) 35 | 36 | row_indices = torch.from_numpy(np.stack(row_ind_list).reshape(*batch_shape, -1)).to(cost_matrix.device) 37 | col_indices = torch.from_numpy(np.stack(col_ind_list).reshape(*batch_shape, -1)).to(cost_matrix.device) 38 | 39 | return row_indices, col_indices 40 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/sge.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $JOB_ID 5 | echo Hostname: $(hostname) 6 | 7 | export CUDA_VISIBLE_DEVICES=all 8 | 9 | qsub: "qsub -g $JOB_GROUP -l h_rt={args.walltime}" 10 | 11 | cpu: 12 | qsub_args: "-l cpu_40=1" 13 | job: "{command}" 14 | 15 | cpu-array: 16 | n_tasks_per_proc: 128 17 | n_procs: 20 18 | qsub_args: "-l cpu_40=1 -t 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 19 | job: "{command}" 20 | 21 | gpu: 22 | qsub_args: "-l gpu_1=1" 23 | job: "{command}" 24 | 25 | gpu-array: 26 | n_tasks_per_proc: 128 27 | n_procs: 1 28 | qsub_args: "-l gpu_1=1 -t 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 29 | job: "{command}" 30 | 31 | mpi: 32 | n_nodes: 1 33 | qsub_args: "-l cpu_40={args.n_nodes}" 34 | job: | 35 | source /etc/profile.d/modules.sh 36 | module load openmpi 37 | 38 | mpirun -np {args.n_procs} --npernode $(( {args.n_procs} / {args.n_nodes} )) \\ 39 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 40 | {command} 41 | 42 | train: 43 | qsub_args: "-l $( (({args.n_gpus}==1)) && printf node_q || printf node_f )=$(( ({args.n_gpus} + 3) / 4 ))" 44 | job: | 45 | source /etc/profile.d/modules.sh 46 | module load openmpi 47 | 48 | n_gpus=$(nvidia-smi -L | wc -l) 49 | 50 | mpirun -np {args.n_gpus} -map-by ppr:$n_gpus:node:PE=48 \\ 51 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 52 | -x MAIN_ADDR=$(hostname -i) \\ 53 | -x MAIN_PORT=3000 \\ 54 | -x COLUMNS=120 \\ 55 | -x PYTHONUNBUFFERED=true \\ 56 | {command} 57 | 58 | use_scandir: False 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | [aiaccel logo]
10 | 
11 | # AIST Toolkit for Accelerating Machine Learning Research
12 | 
13 | * **Research-Oriented**: designed to accelerate your research cycles written in Python
14 | * **HPC Optimized**: intended for use in HPC clusters, including [AI Bridging Cloud Infrastructure (ABCI)](https://abci.ai/)
15 | * **Highly Modular**: designed to let you pick up any part of aiaccel for your research project
16 | 
17 | # Key Features
18 | * [PyTorch/Lightning Toolkit](https://aistairc.github.io/aiaccel/api_reference/torch.html): training toolkit for HPC clusters.
19 | * [Hyperparameter Optimization (HPO)](https://aistairc.github.io/aiaccel/api_reference/hpo.html): ready-to-use HPO algorithms/tools.
20 | * [OmegaConf Utilities](https://aistairc.github.io/aiaccel/api_reference/config.html): OmegaConf-based config utilities.
21 | 
22 | 
23 | # Installation
24 | ```bash
25 | pip install aiaccel
26 | ```
27 | 
28 | # Acknowledgement
29 | * Part of this software was developed in a project commissioned by the New Energy and Industrial Technology Development Organization (NEDO).
30 | * Part of this software was developed by using ABCI 3.0 provided by AIST and AIST Solutions.
31 | * Part of this software was developed by using the TSUBAME4.0 supercomputer at Institute of Science Tokyo.
32 | 
--------------------------------------------------------------------------------
/docs/source/contribution_guide/tests.md:
--------------------------------------------------------------------------------
1 | (test)=
2 | # Tests
3 | 
4 | ## Adding tests
5 | 
6 | - aiaccel uses pytest for testing.
7 | - Create a directory for each unit test under the tests directory.
8 | - The directory structure under `aiaccel/tests` corresponds to that under `aiaccel/aiaccel`, except for a few modules such as config.py. For example, the test for `aiaccel/aiaccel/hpo/optuna/hparams.py` is `aiaccel/tests/hpo/optuna/test_hparams.py`.
9 | - If you have added a new feature or a bug fix, please create the corresponding test code.
10 | 
11 | 
12 | ## Running tests (WIP)
13 | 
14 | - Move to the aiaccel directory and execute the following command to run all tests in your local environment.
15 | 
16 | ~~~bash
17 | cd aiaccel
18 | pytest
19 | ~~~
20 | 
21 | - Specify a file name as an argument to run only a specific test.
22 | 
23 | ~~~bash
24 | pytest aiaccel/tests/hpo/optuna/test_hparams.py
25 | ~~~
26 | 
27 | - In addition, execute the following commands to check coding styles.
28 | 
29 | ~~~bash
30 | ruff check
31 | ruff format --check
32 | mypy --config-file mypy.ini .
33 | docstrfmt --check docs/source/
34 | ~~~
35 | 
36 | 
37 | ## Coverages
38 | 
39 | No strict criteria for code coverage have been set, but coverage should be taken into account when designing tests. Please watch for the following cases.
40 | 
41 | - A significantly lower overall score.
42 | - Abnormally low coverage of a class or module.
43 | - Tests that do not cover a specific branch of an if statement.
44 | 
45 | ### Measuring coverage
46 | 
47 | Run pytest with the option `--cov` to measure C0 coverage.
48 | 
49 | ~~~bash
50 | pytest --cov=aiaccel
51 | ~~~
52 | 
53 | - Replace `aiaccel` with the appropriate path to measure only the coverage of a specific test.
54 | - Run pytest with the options `--cov` and `--cov-branch` to measure C1 coverage.
55 | 56 | ~~~bash 57 | pytest --cov=aiaccel --cov-branch 58 | ~~~ 59 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/scatter_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import numpy.typing as npt 5 | from typing import TypeVar 6 | 7 | from collections.abc import Callable 8 | 9 | import numpy as np 10 | 11 | import torch.distributed as dist 12 | from torch.utils.data import Dataset, Subset 13 | 14 | T = TypeVar("T") 15 | 16 | 17 | def scatter_dataset( 18 | dataset: Dataset[T], 19 | permute_fn: Callable[[npt.NDArray[np.int64]], npt.NDArray[np.int64]] | None = None, 20 | ) -> Subset[T]: 21 | """ 22 | Splits a dataset into subsets and returns the subset corresponding to the current process rank. 23 | 24 | Args: 25 | dataset (Dataset[T]): The input dataset to be split. 26 | permute_fn (Callable[[npt.NDArray[np.int64]], npt.NDArray[np.int64]] | None, optional): 27 | A function that takes an array of indices and returns a permuted version of the array. 28 | If None, a default permutation function using np.random.Generator is used. 29 | Defaults to None. 30 | 31 | Returns: 32 | Subset[T]: The subset of the input dataset corresponding to the current process rank. 33 | """ 34 | 35 | if permute_fn is None: 36 | permute_fn = np.random.Generator(np.random.PCG64(0)).permutation 37 | 38 | world_size = dist.get_world_size() 39 | rank = dist.get_rank() 40 | 41 | dataset_size = len(dataset) # type: ignore[arg-type] 42 | total_size = int(np.ceil(dataset_size / world_size)) * world_size 43 | 44 | indices = permute_fn(np.arange(dataset_size)) 45 | repeated_indices = np.concatenate([indices, indices[: total_size - dataset_size]]) 46 | 47 | split_indices = np.split(repeated_indices, world_size) 48 | 49 | return Subset(dataset, list(split_indices[rank])) 50 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | 3.745401188473625,9.50714306409916,17.40563670799186 3 | 7.319939418114051,5.986584841970366,16.136857010179618 4 | 1.5601864044243652,1.5599452033620265,7.684141829279023 5 | 4.092732049871416,6.640204043382679,14.977162309601907 6 | 5.073199322630971,5.043329732671359,12.912400157971089 7 | 3.704712456699542,4.9709207556996855,12.974724337183765 8 | 2.9286732703557936,1.6323541803337003,9.149709591293272 9 | 3.658814580010525,3.3197397122596115,12.156637668428768 10 | 2.951622208700302,2.4579447020537373,10.106158974565025 11 | 1.5372374660798567,0.7343546816419897,6.41495086832353 12 | 0.16875060014842802,0.6619457046703161,3.578287844876108 13 | 1.2065902187692537,1.1290476982590896,5.27149310836656 14 | 1.1124539377693488,0.8149256915533463,4.47829195031602 15 | 0.07461431914852312,0.3478236979645728,2.528730695609976 16 | 0.6170681987089122,0.6599051964353954,4.589274831289604 17 | 0.12168245964847557,0.5048847013174445,3.2642552606368476 18 | 0.5935341284589359,0.5813746947589595,4.5089318967941985 19 | 0.34584125892871764,0.503864447199984,3.9172427824021305 20 | 0.2219948241636085,0.4651093234204963,3.454138754047353 21 | 0.16007160678105392,0.44573176153075245,3.198080417589434 22 | 0.11300346628110147,0.2886707581778808,2.280678955373986 23 | 
0.10866396959741442,0.18056378660809896,1.4848792823714767 24 | 0.023206681964883624,0.08265572304191926,0.4282522158214057 25 | 0.07027482246483607,0.23971672639479097,1.789417957224991 26 | 0.061595829097461974,0.023502783255227255,0.2985090187707833 27 | 0.07553261256429361,0.11682151987833611,0.8464992601260697 28 | 0.058966934047733205,0.08495038651345468,0.5575744894703902 29 | 0.025835577014612393,0.02120811978369183,0.12407151097718838 30 | 0.033461192510460404,0.0525055872806894,0.2766147044342304 31 | 0.045622106929999184,0.030179818393708938,0.23278162079337505 32 | -------------------------------------------------------------------------------- /examples/torch/image_classification/src/image_classification/task.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as fn 7 | 8 | from torchmetrics.classification import MulticlassAccuracy 9 | 10 | from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule 11 | 12 | 13 | class ImageClassificationTask(OptimizerLightningModule): 14 | def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10): 15 | super().__init__(optimizer_config) 16 | 17 | self.model = model 18 | 19 | self.training_accuracy = MulticlassAccuracy(num_classes=num_classes) 20 | self.validation_accuracy = MulticlassAccuracy(num_classes=num_classes) 21 | 22 | @torch.compile 23 | def forward(self, x: torch.Tensor) -> torch.Tensor: 24 | return self.model(x) # type: ignore 25 | 26 | def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor: 27 | x, y = batch 28 | 29 | logits = self(x) 30 | 31 | loss = fn.cross_entropy(logits, y) 32 | 33 | self.log_dict( 34 | { 35 | "training/loss": loss, 36 | "training/accuracy": self.training_accuracy(logits, y), 37 | }, 38 | prog_bar=True, 39 | ) 40 | 41 | return loss 42 | 43 | def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> None: 44 | x, y = batch 45 | 46 | logits = self(x) 47 | 48 | loss = fn.cross_entropy(logits, y) 49 | 50 | self.log_dict( 51 | { 52 | "validation/loss": loss, 53 | "validation/accuracy": self.validation_accuracy(logits, y), 54 | }, 55 | prog_bar=True, 56 | ) 57 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/file_cached_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any, TypeVar 5 | 6 | from multiprocessing import Manager 7 | from pathlib import Path 8 | import pickle as pkl 9 | import uuid 10 | 11 | from torch.utils.data import Dataset 12 | 13 | __all__ = ["FileCachedDataset"] 14 | 15 | 16 | T_co = TypeVar("T_co", covariant=True) 17 | 18 | 19 | class FileCachedDataset(Dataset[T_co]): 20 | """ 21 | A dataset wrapper that caches samples to disk to reduce memory usage. 22 | 23 | This class wraps an existing `torch.utils.data.Dataset` and caches samples 24 | as pickle files in a specified directory. 25 | 26 | Args: 27 | dataset (Dataset[T]): The dataset to wrap. 28 | cache_path (str | Path): Directory where cached samples will be stored. 
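
    Example:
        A minimal sketch (``base`` stands for any map-style dataset; the cache
        directory name is illustrative)::

            cached = FileCachedDataset(base, cache_path="/tmp/dataset_cache")
            sample = cached[0]  # first access pickles the sample to disk
            sample = cached[0]  # subsequent accesses load the pickle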
29 | 30 | Methods: 31 | __len__(): Returns the number of samples in the dataset. 32 | __getitem__(index: int) -> Any: Retrieves a sample from cache or the original dataset. 33 | """ 34 | 35 | def __init__(self, dataset: Dataset[T_co], cache_path: str | Path) -> None: 36 | self.dataset = dataset 37 | 38 | self.manager = Manager() 39 | self.cache = self.manager.dict() 40 | 41 | self.cache_path = Path(cache_path) 42 | self.cache_path.mkdir(exist_ok=True, parents=True) 43 | 44 | def __len__(self) -> int: 45 | return len(self.dataset) # type: ignore[arg-type] 46 | 47 | def __getitem__(self, index: int) -> Any: 48 | if index not in self.cache: 49 | sample = self.dataset[index] 50 | 51 | self.cache[index] = self.cache_path / f"cache-{uuid.uuid4()}.pkl" 52 | with open(self.cache[index], "wb") as f: 53 | pkl.dump(sample, f) 54 | else: 55 | with open(self.cache[index], "rb") as f: 56 | sample = pkl.load(f) 57 | 58 | return sample 59 | -------------------------------------------------------------------------------- /tests/torch/datasets/test_hdf5_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | 8 | import torch 9 | 10 | import h5py as h5 11 | 12 | from aiaccel.torch.datasets.hdf5_dataset import HDF5Dataset, RawHDF5Dataset 13 | 14 | # with h5.File(Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5", "w") as f: 15 | # for ii in range(10): 16 | # g = f.create_group(f"grp{ii}") # noqa: ERA001 17 | # g.create_dataset("foo", [2, 3, 4]) # noqa: ERA001 18 | # g.create_dataset("bar", [5, 6]) # noqa: ERA001 19 | 20 | # g["foo"][:] = np.random.randn(2, 3, 4) # noqa: ERA001 21 | # g["bar"][:] = np.random.randn(5, 6) # noqa: ERA001 22 | 23 | 24 | def test_raw_hdf5_dataset() -> None: 25 | hdf5_filename = Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5" 26 | f_hdf5 = h5.File(hdf5_filename) 27 | 28 | dataset = RawHDF5Dataset(hdf5_filename) 29 | 30 | assert len(dataset) == 10 31 | assert list(dataset.grp_list) == [f"grp{idx}" for idx in range(10)] 32 | 33 | sample = dataset[5] 34 | assert sorted(sample.keys()) == ["bar", "foo"] 35 | assert np.array_equal(sample["bar"], f_hdf5["grp5"]["bar"][:]) 36 | assert np.array_equal(sample["foo"], f_hdf5["grp5"]["foo"][:]) 37 | 38 | 39 | def test_hdf5_dataset() -> None: 40 | hdf5_filename = Path(__file__).parent / "test_hdf5_dataset_assets" / "dataset.hdf5" 41 | f_hdf5 = h5.File(hdf5_filename) 42 | 43 | dataset = HDF5Dataset(hdf5_filename) 44 | 45 | assert len(dataset) == 10 46 | assert list(dataset.grp_list) == [f"grp{idx}" for idx in range(10)] 47 | 48 | sample = dataset[5] 49 | assert sorted(sample.keys()) == ["bar", "foo"] 50 | assert isinstance(sample["bar"], torch.Tensor) 51 | assert isinstance(sample["foo"], torch.Tensor) 52 | assert np.array_equal(sample["bar"].numpy(), f_hdf5["grp5"]["bar"][:]) 53 | assert np.array_equal(sample["foo"].numpy(), f_hdf5["grp5"]["foo"][:]) 54 | -------------------------------------------------------------------------------- /aiaccel/job/apps/config/pbs.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $PBS_JOBID 5 | echo Hostname: $(hostname) 6 | 7 | export CUDA_VISIBLE_DEVICES=all 8 | 9 | qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1" 10 | 11 | 
cpu: 12 | qsub_args: "-q rt_HF -l select=1" 13 | job: "{command}" 14 | 15 | cpu-array: 16 | n_tasks_per_proc: 128 17 | n_procs: 24 18 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 19 | job: "{command}" 20 | 21 | gpu: 22 | qsub_args: "-q rt_HF -l select=1" 23 | job: "{command}" 24 | 25 | gpu-array: 26 | n_tasks_per_proc: 128 27 | n_procs: 8 28 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 29 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 30 | 31 | mpi: 32 | n_nodes: 1 33 | qsub_args: >- 34 | -q rt_HF 35 | -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} )) 36 | job: | 37 | source /etc/profile.d/modules.sh 38 | module load hpcx 39 | 40 | mpirun -np {args.n_procs} -bind-to none -map-by slot \\ 41 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 42 | {command} 43 | 44 | train: 45 | qsub_args: >- 46 | -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF ) 47 | -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 ) 48 | job: | 49 | source /etc/profile.d/modules.sh 50 | module load hpcx 51 | 52 | mpirun -np {args.n_gpus} -bind-to none -map-by slot \\ 53 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 54 | -x MAIN_ADDR=$(hostname -i) \\ 55 | -x MAIN_PORT=3000 \\ 56 | -x COLUMNS=120 \\ 57 | -x PYTHONUNBUFFERED=true \\ 58 | {command} 59 | 60 | use_scandir: False 61 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_ackley_logscale.csv: -------------------------------------------------------------------------------- 1 | x,y,objective 2 | 0.05564180225431373,32147.193482816965,20.814962543634394 3 | 208.90047049266641,9.695826644515218,21.448998676398375 4 | 0.0003632339256943143,0.0003630322466779861,0.0014595551233256288 5 | 0.9690955677354745,5.755253900459714,12.296378779985623 6 | 0.03231014054617535,38.33310112612593,21.3581260787541 7 | 0.018761886564029658,0.04570932895685663,0.20359442477314715 8 | 0.004495663495585621,3.4162066498470134,9.311384187577072 9 | 0.0034258048967100124,0.1179667403294333,0.6654758825765974 10 | 0.0019893000084822924,0.00014066643139422686,0.005746539690054675 11 | 0.0039935573463010235,0.0032139294641708098,0.015198674098908072 12 | 0.00018093674105720218,1.5889101235890364e-05,0.0005146143530829761 13 | 3.303793418604823e-05,4.1006628675986886e-05,0.00014901809147715994 14 | 0.000167582287432558,9.626334554501767e-05,0.000547623923871754 15 | 0.00011382796427517295,4.95699892804902e-05,0.0003515684113644113 16 | 2.078428499530732e-05,0.00012793034128993056,0.00036703327257470164 17 | 3.570121591230095e-05,7.594601695415612e-05,0.00023754589410174276 18 | 1.0362079557367228e-05,6.28261204382591e-05,0.00018020780579774964 19 | 2.5701380739833768e-05,6.208705753197202e-05,0.000190180515296845 20 | 1.3319973191802473e-05,4.149475743214036e-05,0.00012331408252919118 21 | 4.2468733735770256e-05,2.7083641296808898e-05,0.0001425348740156096 22 | 1.712220841850161e-05,2.7406035616097617e-05,9.142849883048143e-05 23 | 1.2326314004310186e-05,2.240486306276013e-05,7.234532742117494e-05 24 | 2.332753235476556e-05,2.8736733977299986e-05,0.00010472549182694024 25 | 2.158731737744854e-05,1.551623938723579e-05,7.521275362165625e-05 26 | 
1.1406780985592453e-05,1.2097380968715272e-05,4.7035972471576315e-05 27 | 1.5999205650361907e-05,1.5982755233483398e-05,6.397754858866733e-05 28 | 1.2904227006156957e-05,1.7650495365273593e-05,6.185504950195764e-05 29 | 1.3932305367519067e-05,1.5282270702374627e-05,5.850288671993553e-05 30 | 1.2315557985445971e-05,1.0474235817618713e-05,4.573507709437763e-05 31 | 1.2850374704515058e-05,1.3115878062101893e-05,5.194419819432028e-05 32 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/job_config.yaml: -------------------------------------------------------------------------------- 1 | walltime: "1:0:0" 2 | 3 | script_prologue: | 4 | echo Job ID: $PBS_JOBID 5 | echo Hostname: $(hostname) 6 | 7 | export NVIDIA_VISIBLE_DEVICES=all 8 | export JOB_GROUP=job_group 9 | 10 | # activate environment 11 | 12 | qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1" 13 | 14 | cpu: 15 | qsub_args: "-q rt_HF -l select=1" 16 | job: "{command}" 17 | 18 | cpu-array: 19 | n_tasks_per_proc: 128 20 | n_procs: 24 21 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 22 | job: "{command}" 23 | 24 | gpu: 25 | qsub_args: "-q rt_HF -l select=1" 26 | job: "{command}" 27 | 28 | gpu-array: 29 | n_tasks_per_proc: 1 30 | n_procs: 1 31 | qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" 32 | job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" 33 | 34 | mpi: 35 | n_nodes: 1 36 | qsub_args: >- 37 | -q rt_HF 38 | -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} )) 39 | job: | 40 | source /etc/profile.d/modules.sh 41 | module load hpcx 42 | 43 | mpirun -np {args.n_procs} -bind-to none -map-by slot \\ 44 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 45 | {command} 46 | 47 | train: 48 | qsub_args: >- 49 | -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF ) 50 | -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 ) 51 | job: | 52 | source /etc/profile.d/modules.sh 53 | module load hpcx 54 | 55 | mpirun -np {args.n_gpus} -bind-to none -map-by slot \\ 56 | -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \\ 57 | -x MAIN_ADDR=$(hostname -i) \\ 58 | -x MAIN_PORT=3000 \\ 59 | -x COLUMNS=120 \\ 60 | -x PYTHONUNBUFFERED=true \\ 61 | {command} 62 | -------------------------------------------------------------------------------- /aiaccel/torch/apps/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import os 7 | 8 | from hydra.utils import instantiate 9 | from omegaconf import OmegaConf as oc # noqa: N813 10 | 11 | import lightning as lt 12 | 13 | from aiaccel.config import ( 14 | prepare_config, 15 | ) 16 | from aiaccel.config.git import collect_git_status_from_config, print_git_status 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def get_rank(default: int = 0) -> int: 22 | for key in [ 23 | "GLOBAL_RANK", # PyTorch Lightning 24 | "RANK", # torchrun / deepspeed / pytorch launcher 25 | "OMPI_COMM_WORLD_RANK", # OpenMPI 26 | "PMI_RANK", # MPICH / Intel MPI 27 | "MV2_COMM_WORLD_RANK", 
# MVAPICH2 28 | "SLURM_PROCID", # Slurm 29 | ]: 30 | rank = os.environ.get(key) 31 | if rank is not None: 32 | try: 33 | return int(rank) 34 | except ValueError: 35 | pass 36 | 37 | return default 38 | 39 | 40 | def main() -> None: 41 | parser = ArgumentParser() 42 | parser.add_argument("config", type=str, help="Config file in YAML format") 43 | args, unk_args = parser.parse_known_args() 44 | 45 | is_rank_zero = get_rank() == 0 46 | config = prepare_config( 47 | config_filename=args.config, 48 | overwrite_config=oc.from_cli(unk_args), 49 | print_config=is_rank_zero, 50 | save_config=is_rank_zero, 51 | save_filename="merged_config.yaml", 52 | ) 53 | 54 | if is_rank_zero: 55 | status_list = collect_git_status_from_config(config) 56 | print_git_status(status_list) 57 | 58 | if "seed" in config: 59 | lt.seed_everything(config.seed, workers=True) 60 | 61 | # build trainer 62 | trainer: lt.Trainer = instantiate(config.trainer) 63 | 64 | # start training 65 | trainer.fit( 66 | model=instantiate(config.task), 67 | datamodule=instantiate(config.datamodule), 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /docs/source/api_reference/torch.rst: -------------------------------------------------------------------------------- 1 | ########################### 2 | PyTorch/Lightning Toolkit 3 | ########################### 4 | 5 | ********** 6 | Datasets 7 | ********** 8 | 9 | .. currentmodule:: aiaccel.torch.datasets 10 | 11 | .. autosummary:: 12 | :toctree: generated/ 13 | 14 | CachedDataset 15 | FileCachedDataset 16 | HDF5Dataset 17 | RawHDF5Dataset 18 | scatter_dataset 19 | 20 | ************ 21 | Functional 22 | ************ 23 | 24 | .. currentmodule:: aiaccel.torch.functional 25 | 26 | .. autosummary:: 27 | :toctree: generated/ 28 | 29 | linear_sum_assignment 30 | 31 | ************************** 32 | Learning Rate Schedulers 33 | ************************** 34 | 35 | .. currentmodule:: aiaccel.torch.lr_schedulers 36 | 37 | .. autosummary:: 38 | :toctree: generated/ 39 | 40 | SequentialLR 41 | 42 | **************************** 43 | Inference Pipeline Helpers 44 | **************************** 45 | 46 | .. currentmodule:: aiaccel.torch.pipelines 47 | 48 | .. autosummary:: 49 | :toctree: generated/ 50 | 51 | BasePipeline 52 | reorder_fields 53 | 54 | ********************* 55 | Lightning Utilities 56 | ********************* 57 | 58 | .. currentmodule:: aiaccel.torch.lightning 59 | 60 | .. autosummary:: 61 | :toctree: generated/ 62 | 63 | OptimizerLightningModule 64 | OptimizerConfig 65 | build_param_groups 66 | load_checkpoint 67 | ABCIEnvironment 68 | 69 | Lightning Datamodules 70 | ===================== 71 | 72 | .. currentmodule:: aiaccel.torch.lightning.datamodules 73 | 74 | .. autosummary:: 75 | :toctree: generated/ 76 | 77 | SingleDataModule 78 | 79 | Lightning Callbacks 80 | =================== 81 | 82 | .. currentmodule:: aiaccel.torch.lightning.callbacks 83 | 84 | .. autosummary:: 85 | :toctree: generated/ 86 | 87 | SaveMetricCallback 88 | LoadPretrainedCallback 89 | PrintUnusedParam 90 | 91 | **************** 92 | H5py Utilities 93 | **************** 94 | 95 | .. currentmodule:: aiaccel.torch.h5py 96 | 97 | .. 
autosummary:: 98 | :toctree: generated/ 99 | 100 | HDF5Writer 101 | -------------------------------------------------------------------------------- /tests/hpo/optuna/samplers/results_shpere_parallel.csv: -------------------------------------------------------------------------------- 1 | x,y,z,objective 2 | -7.527592869158251,27.04285838459497,13.919636508684306,981.7371245469577 3 | 5.919509051822196,-20.63888157345381,-20.64032877982784,887.0271921570455 4 | -26.514983269908033,21.970568746496113,6.066900704592527,1222.5575130073998 5 | 12.48435466776273,-28.76493034225185,28.19459112971966,1778.215298037157 6 | 1.5549994860073504,-9.653374244853046,13.988330303767995,291.27904239998924 7 | 26.479593715688896,-24.13683370230404,-1.2218086828428865,1285.2484409781014 8 | -13.2663390235088,10.443718134296077,4.244723357733673,303.084675938987 9 | -4.729101515525668,10.213339578295692,6.558605734621125,169.6920156679335 10 | 0.2196810170732446,-8.485493542103784,-6.188221157226787,110.34594149309659 11 | 11.296725015212084,-15.727403606736836,5.327753229707882,403.35217475518004 12 | -7.125573013828579,3.900937699037849,4.515480825727225,86.38067279468704 13 | -9.31166182752802,13.405896735006218,-10.731086701686952,381.5813350590154 14 | -1.1616658423764918,-3.8885564998882307,7.808476052404258,77.44263744313807 15 | -0.6492703772288824,-15.862081140265136,-2.4681152540179947,258.1187630302182 16 | -3.709143730951472,3.6944843986554847,4.301925487461345,45.91352508843491 17 | -8.21726940851094,10.956737273973852,17.27214273428867,485.9005228565786 18 | -1.8895565893228015,-3.624935838084375,-0.32313018434792173,16.814997050518407 19 | 2.6186622387280694,-6.446942991915929,3.3427000779512293,59.59410967268937 20 | -0.8250262119876444,-0.3630397876749818,-2.9274791316944886,9.382600204408494 21 | -0.6567063967932206,1.3997185684316427,-8.295456723743861,71.2050776179087 22 | -6.901146593569348,6.251282173846681,-2.6418226303386065,93.6835799331563 23 | 0.23871003065371532,-3.2723867004752765,1.8465694008787705,14.175315748443946 24 | 2.058561883846985,-8.534725949478572,-5.237952097570439,104.51536623877313 25 | -2.2672173272518577,0.6371818116219701,1.9169560912033994,9.220995725654728 26 | -0.012799083067722927,1.6261060537321832,0.880494424606376,3.4196551462747413 27 | 0.9255796700598163,4.251626999640462,1.4823067290835248,21.130263108786092 28 | -2.308738445525199,4.539218752261391,-1.9332551448019129,29.672255545630637 29 | -0.39815208839101324,-1.3194853372911095,0.9016132644585998,2.7124731194640512 30 | -0.9604194538194184,0.9922414730503442,5.393521651873405,30.99702447734313 31 | -0.8588745224455878,-0.02421947249365025,-0.8472289358025149,1.4560488978150692 32 | -------------------------------------------------------------------------------- /aiaccel/job/apps/local.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3
2 | 
3 | 
4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
5 | # SPDX-License-Identifier: MIT
6 | 
7 | import logging
8 | from math import ceil
9 | from pathlib import Path
10 | import shlex
11 | import subprocess
12 | 
13 | from aiaccel.job.apps import prepare_argument_parser
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | def main() -> None:
19 |     # Load configuration (from the default YAML string)
20 |     config, parser, sub_parsers = prepare_argument_parser("local.yaml")
21 | 
22 |     args = parser.parse_args()
23 |     mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode
24 | 
25 |     for key in ["walltime", "n_nodes", "n_tasks_per_proc"]:
26 |         if getattr(args, key, None) is not None:
27 |             logger.warning(f"Argument '{key}' is defined for compatibility and will not be used in aiaccel-job local.")
28 | 
29 |     # Prepare the job script and arguments
30 |     job = config[mode].job.format(command=shlex.join(args.command), args=args)
31 | 
32 |     if mode in ["cpu-array", "gpu-array"]:
33 |         n_tasks_per_proc = ceil(args.n_tasks / args.n_procs)
34 |         job = f"""\
35 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do
36 |     TASK_INDEX=$(( 1 + {n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) ))
37 | 
38 |     if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then
39 |         break
40 |     fi
41 | 
42 |     TASK_INDEX=$TASK_INDEX \\
43 |     TASK_STEPSIZE={n_tasks_per_proc} \\
44 |         {job} 2>&1 | tee {args.log_filename.with_suffix("")}.${{LOCAL_PROC_INDEX}}.log &
45 | 
46 |     pids[$LOCAL_PROC_INDEX]=$!
47 | done
48 | 
49 | for i in "${{!pids[@]}}"; do
50 |     wait ${{pids[$i]}}
51 | done
52 | """
53 |     else:
54 |         job = f"{job} 2>&1 | tee {args.log_filename}"
55 | 
56 |     job_script = f"""\
57 | #! /bin/bash
58 | 
59 | set -eE -o pipefail
60 | trap 'exit $?' ERR EXIT  # at error and exit
61 | trap 'echo 143' TERM     # at termination (by job scheduler)
62 | trap 'kill 0' INT
63 | 
64 | 
65 | {config.script_prologue}
66 | 
67 | {job}
68 | """
69 | 
70 |     # Create the job script file, remove old status files, and run the job
71 |     args.log_filename.parent.mkdir(exist_ok=True, parents=True)
72 | 
73 |     job_filename: Path = args.log_filename.with_suffix(".sh")
74 |     with open(job_filename, "w") as f:
75 |         f.write(job_script)
76 | 
77 |     subprocess.run(f"bash {job_filename}", shell=True, check=True)
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     main()
82 | 
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. figure:: _static/logo_aiaccel.png
2 |    :width: 600px
3 | 
4 | #######################
5 | Aiaccel Documentation
6 | #######################
7 | 
8 | Aiaccel is a toolkit for accelerating machine learning research.
9 | 
10 | .. grid:: 1 1 2 2
11 |    :gutter: 2
12 |    :padding: 0
13 |    :class-row: surface
14 | 
15 |    .. grid-item-card:: :octicon:`sliders;1.5em;sd-text-primary` Configuration management
16 |       :link: user_guide/config.html
17 | 
18 |       OmegaConf-based config management.
19 | 
20 | 
21 |    .. grid-item-card:: :octicon:`server;1.5em;sd-text-primary` Job management
22 |       :link: user_guide/config.html
23 | 
24 |       HPC-oriented job abstraction.
25 | 
26 |    .. grid-item-card:: :octicon:`flame;1.5em;sd-text-primary` PyTorch/Lightning toolkit
27 |       :link: user_guide/torch.html
28 | 
29 |       Training toolkit for HPC clusters.
30 | 
31 |    .. grid-item-card:: :octicon:`beaker;1.5em;sd-text-primary` Hyperparameter optimization
32 |       :link: user_guide/hpo.html
33 | 
34 |       Ready-to-use algorithms/tools.
35 | 
36 | **************
37 | Key Features
38 | **************
39 | 
40 | :octicon:`zap;1em` Research-Oriented
41 |    Designed to accelerate your research cycles written in Python
42 | 
43 | :octicon:`cpu;1em` HPC Optimized
44 |    Intended for use in high-performance computing (HPC) clusters, including AI Bridging
45 |    Cloud Infrastructure (ABCI).
46 | 
47 | :octicon:`server;1em` Highly Modular
48 |    Designed to let you pick up any part of aiaccel for your research project.
49 | 
50 | ************************
51 | Aiaccel is used in ...
52 | ************************
53 | 
54 | - M3L: Multimodal machine listening toolkit (https://github.com/b-sigpro/m3l)
55 | - SBSS: Scalable blind source separation toolkit (https://github.com/b-sigpro/sbss)
56 | 
57 | *****************
58 | Acknowledgments
59 | *****************
60 | 
61 | - Part of this work was developed under a commissioned project of the New Energy and
62 |   Industrial Technology Development Organization (NEDO).
63 | - Part of this software was developed by using ABCI 3.0 provided by AIST and AIST
64 |   Solutions.
65 | - Part of this software was developed by using the TSUBAME4.0 supercomputer at Institute
66 |   of Science Tokyo.
67 | 
68 | .. toctree::
69 |    :hidden:
70 | 
71 |    user_guide/index.rst
72 | 
73 | .. toctree::
74 |    :hidden:
75 |    :maxdepth: 2
76 | 
77 |    api_reference/index.rst
78 | 
79 | .. toctree::
80 |    :hidden:
81 |    :maxdepth: 2
82 | 
83 |    contribution_guide/index.rst
84 | 
--------------------------------------------------------------------------------
/aiaccel/torch/lightning/abci_environment.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import logging
5 | import os
6 | 
7 | from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
8 | 
9 | log = logging.getLogger(__name__)
10 | 
11 | 
12 | class ABCIEnvironment(ClusterEnvironment):
13 |     """
14 |     Environment class for ABCI.
15 | 
16 |     This class provides methods to interact with the ABCI environment,
17 |     such as retrieving the world size, global rank, node rank, and local rank.
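
    Example:
        A minimal sketch of the typical use: pass an instance to a Lightning
        trainer as a plugin (all other trainer arguments are omitted)::

            import lightning as lt

            trainer = lt.Trainer(plugins=[ABCIEnvironment()])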
18 | """
19 |
20 | def __init__(self) -> None:
21 | self._world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
22 | self._rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
23 | self._local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
24 | self._local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"])
25 |
26 | self._main_address = os.environ["MAIN_ADDR"]
27 | self._main_port = int(os.environ["MAIN_PORT"])
28 |
29 | @property
30 | def creates_processes_externally(self) -> bool:
31 | return True
32 |
33 | @property
34 | def main_address(self) -> str:
35 | return self._main_address
36 |
37 | @property
38 | def main_port(self) -> int:
39 | return self._main_port
40 |
41 | @staticmethod
42 | def detect() -> bool:
43 | return True
44 |
45 | def world_size(self) -> int:
46 | return self._world_size
47 |
48 | def global_rank(self) -> int:
49 | return self._rank
50 |
51 | def node_rank(self) -> int:
52 | return self._rank // self._local_size
53 |
54 | def local_rank(self) -> int:
55 | return self._local_rank
56 |
57 | def set_world_size(self, size: int) -> None:
58 | if size != self.world_size():
59 | raise ValueError(f"`size` is expected to be {self.world_size()}, but {size} is given.")
60 |
61 | def set_global_rank(self, rank: int) -> None:
62 | if rank != self.global_rank():
63 | raise ValueError(f"`rank` is expected to be {self.global_rank()}, but {rank} is given.")
64 |
65 | def validate_settings(self, num_devices: int, num_nodes: int) -> None:
66 | if num_devices != self._local_size:
67 | raise ValueError("`num_devices` should match ${OMPI_COMM_WORLD_LOCAL_SIZE}")
68 |
69 | if num_devices * num_nodes != self._world_size:
70 | raise ValueError("`num_devices * num_nodes` should match ${OMPI_COMM_WORLD_SIZE}")
71 | -------------------------------------------------------------------------------- /aiaccel/hpo/optuna/hparams_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any
5 |
6 | from collections.abc import Callable
7 |
8 | from optuna.trial import Trial
9 |
10 | from aiaccel.hpo.optuna.hparams import Const, Float, Hparam, T
11 |
12 |
13 | class HparamsManager:
14 | """
15 | Manages hyperparameters for optimization.
16 | This class allows defining hyperparameters with various types and provides
17 | a method to suggest hyperparameters for a given trial.
18 | Attributes:
19 | params (dict): A dictionary where keys are hyperparameter names and values
20 | are callables that take a Trial object and return a hyperparameter value.
21 | Methods:
22 | __init__(**params_def: int | float | str | list[int | float] | Hparam[T]) -> None:
23 | Initializes the HparamsManager with the given hyperparameter definitions.
24 | suggest_hparams(trial: Trial) -> dict[str, float | int | str | list[float | int | str]]:
25 | Suggests hyperparameters for the given trial.
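Example:
    A minimal sketch (assuming an existing Optuna study; per the constructor
    below, a bare two-element list is interpreted as a float range and a
    scalar as a constant)::

        manager = HparamsManager(
            lr=Float(low=1e-5, high=1e-1),  # explicit float hyperparameter
            dropout=[0.0, 0.5],             # shorthand for Float(low=0.0, high=0.5)
            optimizer="adam",               # fixed value, wrapped as Const
        )
        hparams = manager.suggest_hparams(study.ask())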
26 | """
27 |
28 | def __init__(self, **params_def: int | float | str | list[int | float] | Hparam[T]) -> None:
29 | self.params: dict[str, Callable[[Trial, str], Any]] = {}
30 | for name, param in params_def.items():
31 | if callable(param):
32 | self.params[name] = param
33 | else:
34 | if isinstance(param, list):
35 | low, high = param
36 | self.params[name] = Float(low=low, high=high)
37 | else:
38 | self.params[name] = Const(value=param)
39 |
40 | def suggest_hparams(self, trial: Trial) -> dict[str, float | int | str | list[float | int | str]]:
41 | """
42 | Suggests hyperparameters for a given trial.
43 | This method generates a dictionary of hyperparameters by applying the
44 | parameter functions stored in `self.params` to the provided trial.
45 | Args:
46 | trial (Trial): An Optuna trial object used to suggest hyperparameters.
47 | Returns:
48 | dict[str, float | int | str | list[float | int | str]]: A dictionary
49 | where keys are parameter names and values are the suggested
50 | hyperparameters, which can be of type float, int, str, or a list of
51 | these types.
52 | """
53 |
54 | return {name: param_fn(trial, name) for name, param_fn in self.params.items()}
55 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Verification Code for NelderMeadSampler Using COCO
2 |
3 | ## 1. File Structure
4 |
5 | ### nelder-mead
6 | ### nelder-mead-subTPE
7 | ### TPE
8 |
9 | - These directories store the CSV files containing the optimization results for each sampler.
10 |
11 | ### experiment_coco.py
12 |
13 | - This is the main code for validation using COCO.
14 | - It is designed to run with dimensions * 20 steps and 10 parallel executions.
15 | - Upon execution, the results from Optuna are output to `optuna_csv`, and results for each parallel step are output to `step_csv`.
16 |
17 | ### main_parallel_coco.py
18 |
19 | - This code uses `job_dispatcher` to submit jobs for each sampler, function, and dimension.
20 |
21 | ### objective.sh
22 |
23 | - This is a script for qsub submission used by `job_dispatcher`.
24 |
25 | ### plot.py
26 |
27 | - This code uses matplotlib to graph the results of each sampler.
28 | - It references the `optuna_csv` files in each sampler's directory.
29 |
30 | ### result_bbob_dim_vs_value-fopt_parallel.png
31 |
32 | - This is a graph image visualizing the validation results output by running `plot.py`.
33 | - The graph displays 24 benchmark functions with the number of dimensions on the horizontal axis and the mean and standard deviation of optimization results on the vertical axis.
34 |
35 | ## 2. Execution Instructions
36 |
37 | - Install aiaccel and activate the virtual environment.
38 |
39 | - Install COCO.
40 | - For details, please refer to the following GitHub repository:
41 | https://github.com/numbbo/coco
42 |
43 | - Please replace the # activate environment and the `job_group` in `job_config.yaml` with the appropriate commands and IDs.
44 | - When you run the command below on ABCI, the validation for each sampler will be executed.
45 |
46 | ```
47 | aiaccel-job pbs --config job_config.yaml cpu --walltime 4:00:00 main_parallel_coco.log -- python3.13 main_parallel_coco.py
48 | ```
49 |
50 | - The results are saved in `optuna_csv` and `step_csv` under each directory.
51 |
52 | - To run `plot.py`, you need to install pandas and matplotlib.
53 |
54 | ```
55 | pip install pandas matplotlib
56 | python plot.py
57 | ```
58 |
59 | ## 3. Checking the Results
60 |
61 | - The validation results for each sampler are output to `optuna_csv` and `step_csv` under the corresponding sampler's directory.
62 | - The visualization results from `plot.py` are output to `result_bbob_dim_vs_value-fopt_parallel.png`.
63 | - From the visualization results, it can be observed that `nelder-mead-subTPE` tends to yield better results during parallel execution. However, in some cases, `nelder-mead` may perform better depending on the function.
64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # tentative
2 | /pixi.lock
3 |
4 | # examples
5 | miniforge3/
6 | Miniforge*.sh
7 | data/
8 | checkpoints/
9 | events*
10 | hparams.yaml
11 | merged_config.yaml
12 | trial*.json
13 | *.db
14 |
15 | # Byte-compiled / optimized / DLL files
16 | __pycache__/
17 | *.py[cod]
18 | *$py.class
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | # lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 | MANIFEST
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | .hypothesis/
62 | .pytest_cache/
63 |
64 | # Translations
65 | *.mo
66 | *.pot
67 |
68 | # Django stuff:
69 | *.log
70 | local_settings.py
71 | db.sqlite3
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | target/
85 |
86 | # Jupyter Notebook
87 | .ipynb_checkpoints
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # celery beat schedule file
93 | celerybeat-schedule
94 |
95 | # SageMath parsed files
96 | *.sage.py
97 |
98 | # Environments
99 | .env
100 | .venv
101 | env/
102 | venv/
103 | ENV/
104 | env.bak/
105 | venv.bak/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # mkdocs documentation
115 | /site
116 |
117 | # mypy
118 | .mypy_cache/
119 |
120 | # application workspace
121 | /work/
122 | work_aiaccel/
123 |
124 | # vscode
125 | .vscode
126 | *.code-workspace
127 |
128 | # document
129 | docs/build/html
130 | docs/source/api_reference/
131 | !docs/source/api_reference/index.rst
132 | !docs/source/api_reference/torch.rst
133 | !docs/source/api_reference/hpo.rst
134 |
135 | # IntelliJ
136 | .idea
137 |
138 | # Docker
139 | .devcontainer
140 | Dockerfile
141 |
142 | # ruff
143 | .ruff_cache
144 |
145 | # example
146 | examples/hpo/samplers/coco/exdata
147 | examples/hpo/samplers/coco/*/optuna_csv
148 | examples/hpo/samplers/coco/*/step_csv
149 |
150 | # pixi environments
151 | .pixi/*
152 | !.pixi/config.toml
153 | -------------------------------------------------------------------------------- /examples/hpo/basic/README.md:
-------------------------------------------------------------------------------- 1 | # Basic usage of aiaccel-hpo
2 | ## Getting started
3 | ```bash
4 | aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] n_trials=100 -- python objective.py --x1={x1} --x2={x2} {out_filename}
5 | ```
6 | A workspace `aiaccel-hpo_***` will be created, and you will get output something like:
7 | ```plain
8 | [I 2025-08-11 23:19:09,865] A new study created in RDB with name: aiaccel-hpo
9 | [I 2025-08-11 23:19:10,159] Trial 3 finished with value: 1.199387 and parameters: {'x1': 0.7651250790017732, 'x2': 0.7835626174783031}. Best is trial 3 with value: 1.199387.
10 | [I 2025-08-11 23:19:10,179] Trial 7 finished with value: 0.13314 and parameters: {'x1': 0.28734908107070123, 'x2': 0.22487902368959145}. Best is trial 7 with value: 0.13314.
11 | [I 2025-08-11 23:19:10,190] Trial 6 finished with value: 0.854472 and parameters: {'x1': 0.5282599103748785, 'x2': 0.7585598197415366}. Best is trial 7 with value: 0.13314.
12 | [I 2025-08-11 23:19:10,202] Trial 1 finished with value: 0.241872 and parameters: {'x1': 0.490180501594382, 'x2': 0.03993315068224257}. Best is trial 7 with value: 0.13314.
13 | [I 2025-08-11 23:19:10,215] Trial 0 finished with value: 0.267713 and parameters: {'x1': 0.1980697319379605, 'x2': 0.4779975949100864}. Best is trial 7 with value: 0.13314.
14 | [I 2025-08-11 23:19:10,225] Trial 4 finished with value: 0.223939 and parameters: {'x1': 0.42026494162838846, 'x2': 0.2175229115138555}. Best is trial 7 with value: 0.13314.
15 | [I 2025-08-11 23:19:10,238] Trial 2 finished with value: 1.099494 and parameters: {'x1': 0.8241694903158984, 'x2': 0.6482584131495605}. Best is trial 7 with value: 0.13314.
16 | [I 2025-08-11 23:19:10,249] Trial 5 finished with value: 0.489583 and parameters: {'x1': 0.6165836750706742, 'x2': 0.33076842412638574}. Best is trial 7 with value: 0.13314.
17 | [I 2025-08-11 23:19:10,259] Trial 8 finished with value: 1.624779 and parameters: {'x1': 0.9942998803639703, 'x2': 0.7975879798801359}. Best is trial 7 with value: 0.13314.
18 | [I 2025-08-11 23:19:10,273] Trial 9 finished with value: 1.497936 and parameters: {'x1': 0.9652436480683448, 'x2': 0.7524894798074183}. Best is trial 7 with value: 0.13314.
19 | ```
20 |
21 | You can also run the optimization by specifying a config file:
22 | ```bash
23 | aiaccel-hpo optimize --config experiment/config.yaml
24 | ```
25 |
26 | In this case, `experiment/` is used as a workspace.
27 |
28 | You can also combine `aiaccel-hpo` with `aiaccel-job` as:
29 | ```bash
30 | aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] n_trials=100 n_max_jobs=10 -- \
31 | aiaccel-job local cpu {config.working_directory}/{job_name}.log -- \
32 | python objective.py --x1={x1} --x2={x2} {out_filename}
33 | ``` -------------------------------------------------------------------------------- /docs/source/user_guide/torch.rst: -------------------------------------------------------------------------------- 1 | ##########################
2 | Training a PyTorch Model
3 | ##########################
4 |
5 | *****************
6 | Getting Started
7 | *****************
8 |
9 | Aiaccel-based training is a wrapper around PyTorch Lightning, which can be executed as
10 | follows:
11 |
12 | .. code-block:: bash
13 |
14 | python -m aiaccel.torch.apps.train config.yaml
15 |
16 | The config file `config.yaml` typically consists of `trainer`, `datamodule`, and `task`
17 | as follows:
18 |
19 | ..
code-block:: yaml
20 | :caption: config.yaml
21 | :linenos:
22 |
23 | _base_: ${resolve_pkg_path:aiaccel.torch.apps.config}/train_base.yaml
24 |
25 | trainer:
26 | max_epochs: 10
27 |
28 | callbacks:
29 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint
30 | filename: "{epoch:04d}"
31 | save_last: True
32 | save_top_k: -1
33 |
34 | datamodule:
35 | _target_: aiaccel.torch.lightning.datamodules.SingleDataModule
36 |
37 | train_dataset_fn:
38 | _partial_: True
39 | _target_: torchvision.datasets.MNIST
40 |
41 | root: "./dataset"
42 | train: True
43 | download: True
44 |
45 | transform:
46 | _target_: torchvision.transforms.Compose
47 | transforms:
48 | - _target_: torchvision.transforms.Resize
49 | size: [256, 256]
50 | - _target_: torchvision.transforms.Grayscale
51 | num_output_channels: 3
52 | - _target_: torchvision.transforms.ToTensor
53 | - _target_: torchvision.transforms.Normalize
54 | mean: [0.5]
55 | std: [0.5]
56 |
57 | val_dataset_fn:
58 | _partial_: True
59 | _inherit_: ${datamodule.train_dataset_fn}
60 |
61 | train: False
62 |
63 | batch_size: 128
64 | wrap_scatter_dataset: False
65 |
66 | task:
67 | _target_: my_task.MyTask
68 | num_classes: 10
69 |
70 | model:
71 | _target_: torchvision.models.resnet50
72 | weights:
73 | _target_: hydra.utils.get_object
74 | path: torchvision.models.ResNet50_Weights.DEFAULT
75 |
76 | optimizer_config:
77 | _target_: aiaccel.torch.lightning.OptimizerConfig
78 | optimizer_generator:
79 | _partial_: True
80 | _target_: torch.optim.Adam
81 | lr: 1.e-4
82 |
83 | **********************
84 | Distributed Training
85 | **********************
86 |
87 | WIP...
88 |
89 | *****************
90 | Other Utilities
91 | *****************
92 |
93 | Other utilities are listed in :doc:`API Reference <../api_reference/torch>`.
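For example, a model trained with the app above can be restored from its working
directory. The following is a minimal sketch (it assumes the default layout
produced by training, i.e. a saved ``merged_config.yaml`` next to a
``checkpoints/last.ckpt``, and a hypothetical path):

.. code-block:: python

   from aiaccel.torch.lightning.ckpt import load_checkpoint

   # Loads merged_config.yaml, instantiates the task via its
   # load_from_checkpoint method, and returns the model with its config.
   model, config = load_checkpoint("path/to/working_directory", device="cpu")
   model.eval()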
94 | -------------------------------------------------------------------------------- /examples/torch/image_classification/recipes/resnet50.cifar10/config.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | _target_: lightning.Trainer 3 | default_root_dir: ${working_directory} 4 | 5 | max_epochs: 200 6 | precision: bf16-mixed 7 | 8 | devices: 1 9 | 10 | logger: 11 | _target_: lightning.pytorch.loggers.CSVLogger 12 | save_dir: ${working_directory} 13 | name: "" 14 | prefix: "" 15 | 16 | 17 | callbacks: 18 | - _target_: lightning.pytorch.callbacks.ModelCheckpoint 19 | save_top_k: 1 20 | monitor: "validation/accuracy" 21 | mode: "max" 22 | filename: "epoch={epoch:04d}_score={validation/accuracy:.2f}" 23 | 24 | - _target_: lightning.pytorch.callbacks.LearningRateMonitor 25 | logging_interval: "step" 26 | 27 | - _target_: lightning.pytorch.callbacks.RichProgressBar 28 | refresh_rate: 5 29 | - _target_: lightning.pytorch.callbacks.RichModelSummary 30 | max_depth: 3 31 | 32 | 33 | datamodule: 34 | _target_: aiaccel.torch.lightning.datamodules.SingleDataModule 35 | 36 | train_dataset_fn: 37 | _partial_: true 38 | _target_: torchvision.datasets.CIFAR10 39 | train: True 40 | transform: 41 | _target_: torchvision.transforms.Compose 42 | transforms: 43 | - _target_: torchvision.transforms.RandomCrop 44 | size: 32 45 | padding: 4 46 | - _target_: torchvision.transforms.RandomHorizontalFlip 47 | - _target_: torchvision.transforms.ToTensor 48 | - _target_: torchvision.transforms.Normalize 49 | mean: [0.4914, 0.4822, 0.4465] 50 | std: [0.2023, 0.1994, 0.2010] 51 | 52 | val_dataset_fn: 53 | _partial_: true 54 | _target_: torchvision.datasets.CIFAR10 55 | train: False 56 | transform: 57 | _target_: torchvision.transforms.Compose 58 | transforms: 59 | - _target_: torchvision.transforms.ToTensor 60 | - _target_: torchvision.transforms.Normalize 61 | mean: [0.4914, 0.4822, 0.4465] 62 | std: [0.2023, 0.1994, 0.2010] 63 | 64 | common_args: 65 | root: "./data" 66 | download: True 67 | 68 | use_scatter: False 69 | batch_size: 256 70 | num_workers: 24 71 | 72 | task: 73 | _target_: image_classification.task.ImageClassificationTask 74 | num_classes: 10 75 | 76 | model: 77 | _target_: image_classification.small_resnet50.SmallResNet50 78 | num_classes: 10 79 | 80 | optimizer_config: 81 | _target_: aiaccel.torch.lightning.OptimizerConfig 82 | optimizer_generator: 83 | _partial_: True 84 | _target_: torch.optim.Adam 85 | lr: 1.e-3 86 | scheduler_generator: 87 | _partial_: True 88 | _target_: torch.optim.lr_scheduler.CosineAnnealingLR 89 | T_max: 200 90 | 91 | scheduler_interval: epoch 92 | -------------------------------------------------------------------------------- /aiaccel/job/apps/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import cast 5 | 6 | from argparse import ArgumentParser, _SubParsersAction 7 | from importlib import resources 8 | import os 9 | from pathlib import Path 10 | 11 | from omegaconf import DictConfig 12 | 13 | from aiaccel.config import prepare_config, print_config, setup_omegaconf 14 | 15 | setup_omegaconf() 16 | 17 | 18 | def prepare_argument_parser( 19 | default_config_name: str, 20 | ) -> tuple[DictConfig, ArgumentParser, _SubParsersAction]: # type: ignore 21 | parser = ArgumentParser(add_help=False) 22 | parser.add_argument("--print_config", 
action="store_true")
23 | parser.add_argument("--config", type=Path, default=None)
24 | args, _ = parser.parse_known_args()
25 |
26 | args.config = Path(
27 | args.config
28 | or os.environ.get("AIACCEL_JOB_CONFIG")
29 | or (Path(str(resources.files(__package__) / "config")) / default_config_name)
30 | )  # type: ignore
31 |
32 | config = cast(DictConfig, prepare_config(args.config))
33 |
34 | if args.print_config:
35 | print_config(config)
36 |
37 | parser = ArgumentParser()
38 | parser.add_argument("--print_config", action="store_true")
39 | parser.add_argument("--config", type=Path)
40 | sub_parsers = parser.add_subparsers(dest="mode", required=True)
41 |
42 | parent_parser = ArgumentParser(add_help=False)
43 | parent_parser.add_argument("--walltime", type=str, default=config.walltime)
44 | parent_parser.add_argument("log_filename", type=Path)
45 | parent_parser.add_argument("command", nargs="+")
46 |
47 | sub_parser = sub_parsers.add_parser("cpu", parents=[parent_parser])
48 | sub_parser.add_argument("--n_tasks", type=int)
49 | sub_parser.add_argument("--n_tasks_per_proc", type=int, default=config["cpu-array"].n_tasks_per_proc)
50 | sub_parser.add_argument("--n_procs", type=int, default=config["cpu-array"].n_procs)
51 |
52 | sub_parser = sub_parsers.add_parser("gpu", parents=[parent_parser])
53 | sub_parser.add_argument("--n_tasks", type=int)
54 | sub_parser.add_argument("--n_tasks_per_proc", type=int, default=config["gpu-array"].n_tasks_per_proc)
55 | sub_parser.add_argument("--n_procs", type=int, default=config["gpu-array"].n_procs)
56 |
57 | sub_parser = sub_parsers.add_parser("mpi", parents=[parent_parser])
58 | sub_parser.add_argument("--n_procs", type=int, required=True)
59 | sub_parser.add_argument("--n_nodes", type=int, default=config["mpi"].n_nodes)
60 |
61 | sub_parser = sub_parsers.add_parser("train", parents=[parent_parser])
62 | sub_parser.add_argument("--n_gpus", type=int)
63 |
64 | return config, parser, sub_parsers
65 | -------------------------------------------------------------------------------- /docs/source/contribution_guide/coding_styles.md: -------------------------------------------------------------------------------- 1 | (coding-style)=
2 | # Coding styles
3 |
4 | ## Basic rules
5 |
6 | - Write source code for aiaccel in Python.
7 | - Coding style should follow PEP8.
8 | - Validate the coding style by using pycodestyle and flake8 in aiaccel.
9 | - See also Docstrings below.
10 | - Write type hints whenever possible, but there is no type hint validation in aiaccel.
11 | - When using a built-in such as `list` as a type hint, add a `from __future__ import annotations` import to support Python 3.8 in aiaccel.
12 | - Use [`numpy.random.RandomState`](https://numpy.org/doc/1.16/reference/generated/numpy.random.RandomState.html) to generate random values and maintain compatibility with [Optuna](https://github.com/optuna/optuna), which is used by aiaccel.
13 |
14 | ## Docstrings
15 |
16 | Basically, write docstrings in accordance with the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings). However, please note the following exceptions.
17 |
18 | - Docstrings for each module are not necessarily required.
19 | - In the `Args:` section, a parameter type is described in parentheses after the parameter name.
20 | - Add an `Example:` section as needed.
21 | - Include the docstring of the `__init__` method in the class docstring. Do not write it under `__init__`.
22 | - Use sphinx-style links to Python objects.
23 | - When using VSCode as an editor, [autoDocstring](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring) is useful for generating docstrings.
24 |
25 | ### Example
26 |
27 | ```python
28 | class ExampleClass:
29 | """Summary of class.
30 |
31 | There can be additional description(s) of this class.
32 |
33 | Args:
34 | param1 (type_of_param1): Description of `param1` which
35 | is given when __init__ method is called.
36 | param2 (type_of_param2): Description of `param2`.
37 |
38 | Attributes:
39 | param1 (type_of_param1): Description of `param1`.
40 | param2 (type_of_param2): Description of `param2`.
41 | param3 (type_of_param3): Description of `param3`.
42 | """
43 |
44 | def __init__(self, param1: type_of_param1, param2: type_of_param2):
45 | self.param1 = param1
46 | self.param2 = param2
47 | self.param3 = generate_param3()
48 |
49 | def method(self, arg1: type_of_arg1) -> type_of_return:
50 | """Receives `type_of_arg1` object and returns return_of_method.
51 |
52 | Args:
53 | arg1 (type_of_arg1): Description of `arg1`.
54 |
55 | Returns:
56 | type_of_return: Description of return value. If this method
57 | returns nothing, this section can be omitted.
58 |
59 | Raises:
60 | TypeOfException: Description of Exception.
61 |
62 | """
63 | ...
64 | return return_of_method
65 |
66 | ```
67 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/ckpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any
5 |
6 | import logging
7 | from pathlib import Path
8 |
9 | from hydra.utils import instantiate
10 | from omegaconf import DictConfig, ListConfig
11 |
12 | from torch import nn
13 |
14 | from huggingface_hub import snapshot_download
15 |
16 | from aiaccel.config import prepare_config
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def load_checkpoint(
22 | model_path: str | Path,
23 | config_name: str = "merged_config.yaml",
24 | device: str = "cuda",
25 | overwrite_config: DictConfig | ListConfig | dict[Any, Any] | list[Any] | None = None,
26 | ) -> tuple[nn.Module, DictConfig | ListConfig]:
27 | """
28 | Load a PyTorch Lightning model from a pre-trained checkpoint.
29 |
30 | This function loads a model from a specified path, which can be a local directory
31 | or a Hugging Face repository. It also loads the associated configuration file and
32 | allows for optional configuration overrides.
33 |
34 |
35 | Args:
36 | model_path (str | Path): The path to the model directory or Hugging Face repo.
37 | For local paths, use the format "file://" or just the path (str | Path).
38 | For Hugging Face, use the format "hf://".
39 | config_name (str): The name of the configuration file to load. Default is "merged_config.yaml".
40 | device (str): The device to map the model to. Default is "cuda".
41 | overwrite_config (DictConfig | ListConfig | dict | list | None): Optional configuration overrides.
42 | """
43 |
44 | if isinstance(model_path, str):
45 | if model_path.startswith("hf://"):
46 | logger.info("Downloading model from Hugging Face...")
47 | hf_path = model_path.removeprefix("hf://")
48 | model_path = Path(snapshot_download(hf_path))
49 | elif model_path.startswith("file://"):
50 | model_path = Path(model_path.removeprefix("file://"))
51 | else:
52 | model_path = Path(model_path)
53 |
54 | config_path = model_path / config_name
55 | config = prepare_config(config_path, overwrite_config=overwrite_config)
56 |
57 | checkpoint_filename = config.checkpoint_filename if "checkpoint_filename" in config else "last.ckpt"
58 | checkpoint_path = model_path / "checkpoints" / checkpoint_filename
59 |
60 | logger.info(f"Loading model from {checkpoint_path}...")
61 |
62 | config.task._target_ += ".load_from_checkpoint"
63 | model = instantiate(
64 | config.task,
65 | checkpoint_path=checkpoint_path,
66 | map_location=device,
67 | )
68 |
69 | return model, config
70 | -------------------------------------------------------------------------------- /docs/source/contribution_guide/pull_requests.md: -------------------------------------------------------------------------------- 1 | # Pull Requests
2 |
3 | When you want your modified code to be reflected in the repository, please submit a pull request.
4 |
5 | ## Procedures
6 |
7 | - Please fork the aiaccel repository on GitHub.
8 | - After forking, run the `git clone` command for the forked repository.
9 |
10 | ~~~bash
11 | git clone https://github.com/[YOUR USERNAME]/aiaccel.git
12 | ~~~
13 |
14 | ## Developments
15 | - Update your local repository to the latest version.
16 |
17 | ~~~bash
18 | git checkout main
19 | git pull upstream main
20 | ~~~
21 |
22 | - Create a branch.
23 |
24 | ~~~bash
25 | git checkout -b feature/add-new-feature
26 | ~~~
27 |
28 | - Commit locally using the `git add` and `git commit` commands as you progress.
29 |
30 | - The commit message describes the motivation for the change, the nature of the bug, or details of the enhancement.
31 | - The message should be written in such a way that its contents can be understood without referring to the code.
32 |
33 |
34 | ## Submitting
35 |
36 | Before submitting a pull request, confirm the following:
37 | - Did you discuss it with other developers on issues in advance?
38 | - Can it be distributed under the MIT license?
39 | - Are there appropriate [unit tests](#test)?
40 | - Can the [unit tests](#test) be run locally?
41 | - Do public functions have docstrings?
42 | - Can the [documentation](#documentation-wip) be rendered correctly?
43 | - Is the [coding style](#coding-style) appropriate?
44 | - Is the commit message appropriate?
45 | - For larger commits, please provide an example (docs/source/examples) and a module-level description.
46 | - If you are adding compiled code, have you modified setup.py?
47 |
48 | After confirming the above, do the following:
49 | - Push changes to the fork on GitHub.
50 |
51 | ~~~bash
52 | git push origin feature/add-new-feature
53 | ~~~
54 |
55 | - Enter your GitHub username and password.
56 | - Move to the GitHub web page and write the title and message, noting the following.
57 | - Title
58 | - Briefly describe the changes.
59 | - Code should be enclosed in backquotes.
60 | - Do not end with a period.
61 | - Descriptions
62 | - Write the motivation.
63 | - Write the changes.
64 | - If the related issues can be closed, please close them with `Close #N`.
65 | - If the work is in progress, write the remaining tasks.
66 | - Submit the pull request.
67 |
68 | ## Review processes
69 |
70 | - Other developers can contribute comments to improve implementations, documents, and coding styles in the pull request.
71 | - When updating code in the pull request, please commit the changes to your local repository and push them to the fork only after they have been successfully tested in your local environment.
72 | - If the pull request has been reviewed and approved by at least one member of the aiaccel development team, it will be merged into the main branch.
73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | # Configuration file for the Sphinx documentation builder.
5 | # Full options: https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | from pathlib import Path
8 | import sys
9 |
10 | import aiaccel
11 |
12 | # -- Path setup --------------------------------------------------------------
13 | root_path = Path(__file__).parent.parent.parent
14 | sys.path.insert(0, str(root_path.absolute()))
15 |
16 | # -- Project information -----------------------------------------------------
17 | project = "aiaccel"
18 | author = "National Institute of Advanced Industrial Science And Technology (AIST)"
19 | project_copyright = author
20 | release = aiaccel.__version__
21 |
22 | html_logo = f"{root_path}/docs/image/logo_aiaccel.png"
23 | html_favicon = f"{root_path}/docs/image/favicon.ico"
24 |
25 | # -- General configuration ---------------------------------------------------
26 | extensions = [
27 | "sphinx.ext.autosummary",
28 | "sphinx.ext.autodoc",
29 | "sphinx.ext.githubpages",
30 | "sphinx.ext.napoleon",
31 | "sphinx.ext.todo",
32 | "sphinx.ext.viewcode",
33 | "sphinx.ext.doctest",
34 | "sphinx_design",
35 | "sphinx_copybutton",
36 | "myst_parser",
37 | ]
38 |
39 | templates_path = ["_templates"]
40 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
41 |
42 | source_suffix = {
43 | ".rst": "restructuredtext",
44 | ".txt": "markdown",
45 | ".md": "markdown",
46 | }
47 |
48 | language = "en"
49 |
50 | # -- HTML output -------------------------------------------------------------
51 | html_theme = "shibuya"
52 | html_show_sourcelink = False
53 | html_show_sphinx = False
54 |
55 | html_static_path = ["_static"]
56 | html_theme_options = {
57 | "accent_color": "gold",
58 | "nav_links": [
59 | {"title": "User Guide", "url": "user_guide/index"},
60 | {
61 | "title": "API Reference",
62 | "url": "api_reference/index",
63 | "children": [
64 | {
65 | "title": "OmegaConf Utilities",
66 | "url": "api_reference/config",
67 | "summary": "aiaccel.config",
68 | },
69 | {
70 | "title": "PyTorch/Lightning Toolkit",
71 | "url": "api_reference/torch",
72 | "summary": "aiaccel.torch",
73 | },
74 | {
75 | "title": "Hyperparameter Optimization",
76 | "url": "api_reference/hpo",
77 | "summary": "aiaccel.hpo",
78 | },
79 | ],
80 | },
81 | {"title": "Contribution Guide", "url": "contribution_guide/index"},
82 | ],
83 | "github_url": "https://github.com/aistairc/aiaccel",
84 | "globaltoc_expand_depth": 1,
85 | }
86 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload aiaccel package to PyPI
2 |
3 | on:
4 | schedule:
5 | - cron: '0 0 1 * *' 6 | workflow_dispatch: 7 | 8 | concurrency: 9 | group: pypi-publish 10 | cancel-in-progress: false 11 | 12 | jobs: 13 | check_commits: 14 | runs-on: ubuntu-latest 15 | outputs: 16 | has_changes: ${{ steps.check_commits.outputs.has_changes }} 17 | version: ${{ steps.get_version.outputs.version }} 18 | tag: ${{ steps.get_version.outputs.tag }} 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | fetch-tags: true 25 | 26 | - name: Check for commits since last tag 27 | id: check_commits 28 | run: | 29 | LAST_TAG=$(git tag --list 'v*' --sort=-v:refname | head -n 1) 30 | echo "Last tag: ${LAST_TAG:-}" 31 | if [ -z "$LAST_TAG" ] || git log "${LAST_TAG}.." --pretty=oneline | grep .; then 32 | echo "has_changes=true" >> $GITHUB_OUTPUT 33 | else 34 | echo "has_changes=false" >> $GITHUB_OUTPUT 35 | fi 36 | 37 | - name: Get current [year].[month](.[patch]) version 38 | id: get_version 39 | run: | 40 | BASE=$(date '+%Y.%-m') 41 | LAST=$(git tag -l "v${BASE}*" --sort=-v:refname | head -n1) 42 | 43 | case "$LAST" in 44 | "") VERSION=$BASE.0 ;; 45 | *) VERSION=$BASE.$(( ${LAST##*.}+1 )) ;; 46 | esac 47 | 48 | echo "version=$VERSION" >>"$GITHUB_OUTPUT" 49 | echo "tag=v$VERSION" >>"$GITHUB_OUTPUT" 50 | 51 | publish-if-needed: 52 | needs: check_commits 53 | if: needs.check_commits.outputs.has_changes == 'true' 54 | runs-on: ubuntu-latest 55 | env: 56 | PIP_INDEX_URL: https://download.pytorch.org/whl/cpu 57 | PIP_EXTRA_INDEX_URL: https://pypi.org/simple 58 | permissions: 59 | contents: write 60 | id-token: write 61 | 62 | steps: 63 | - uses: actions/checkout@v4 64 | with: 65 | fetch-depth: 0 66 | fetch-tags: true 67 | 68 | - name: Set up Python 69 | uses: actions/setup-python@v5 70 | with: 71 | cache: 'pip' 72 | cache-dependency-path: pyproject.toml 73 | 74 | - name: Install dependencies 75 | run: | 76 | pip install -e .[dev,github-actions] 77 | 78 | - name: Update and push tag 79 | run: | 80 | git config user.name "github-actions[bot]" 81 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 82 | git tag ${{ needs.check_commits.outputs.tag }} 83 | git push origin ${{ needs.check_commits.outputs.tag }} 84 | 85 | - name: Build package 86 | run: | 87 | hatch build 88 | 89 | - name: Publish to PyPI 90 | uses: pypa/gh-action-pypi-publish@release/v1 91 | 92 | - name: Create GitHub Release 93 | env: 94 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 95 | run: | 96 | gh release create ${{ needs.check_commits.outputs.tag }} \ 97 | --title ${{ needs.check_commits.outputs.tag }} \ 98 | --notes "Release created automatically by GitHub Actions" 99 | -------------------------------------------------------------------------------- /aiaccel/job/apps/slurm.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | 4 | import os 5 | from pathlib import Path 6 | import shlex 7 | import subprocess 8 | import time 9 | 10 | from aiaccel.job.apps import prepare_argument_parser 11 | 12 | 13 | def main() -> None: 14 | # Load configuration (from the default YAML string) 15 | config, parser, sub_parsers = prepare_argument_parser("slurm.yaml") 16 | 17 | args = parser.parse_args() 18 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode 19 | 20 | # Prepare the job script and arguments 21 | job = config[mode].job.format(command=shlex.join(args.command), args=args) 22 | 23 | if mode in ["cpu-array", "gpu-array"]: 24 | job = f"""\ 25 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do 26 | TASK_INDEX=$(( SLURM_ARRAY_TASK_ID + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) )) 27 | 28 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then 29 | break 30 | fi 31 | 32 | TASK_INDEX=$TASK_INDEX \\ 33 | TASK_STEPSIZE={args.n_tasks_per_proc} \\ 34 | {job} > {args.log_filename.with_suffix("")}.${{SLURM_ARRAY_TASK_ID}}-${{LOCAL_PROC_INDEX}}.log 2>&1 & 35 | 36 | pids[$LOCAL_PROC_INDEX]=$! 37 | done 38 | 39 | for i in "${{!pids[@]}}"; do 40 | wait ${{pids[$i]}} 41 | done 42 | """ 43 | job_log_filename = args.log_filename.with_suffix(".%a.log") 44 | job_status_filename: Path = args.log_filename.with_suffix(".${SLURM_ARRAY_TASK_ID}.out") 45 | 46 | status_filename_list = [] 47 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs): 48 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out")) 49 | else: 50 | job_log_filename = args.log_filename.resolve() 51 | job_status_filename = args.log_filename.with_suffix(".out").resolve() 52 | 53 | status_filename_list = [job_status_filename] 54 | 55 | job_script = f"""\ 56 | #! /bin/bash 57 | #SBATCH -o {job_log_filename} 58 | #SBATCH -t {args.walltime} 59 | 60 | set -eE -o pipefail 61 | trap 'echo $? 
> {job_status_filename}' ERR EXIT  # at error and exit
62 |
63 |
64 | {config.script_prologue}
65 |
66 | {job}
67 | """
68 |
69 | sbatch = config.sbatch.format(args=args)
70 | sbatch_args = config[mode].sbatch_args.format(args=args)
71 |
72 | # Create the job script file, remove old status files, and run the job
73 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
74 |
75 | job_filename: Path = args.log_filename.with_suffix(".sh")
76 | with open(job_filename, "w") as f:
77 | f.write(job_script)
78 |
79 | for status_filename in status_filename_list:
80 | status_filename.unlink(missing_ok=True)
81 |
82 | subprocess.run(f"{sbatch} {sbatch_args} {job_filename}", shell=True, check=True)
83 |
84 | for status_filename in status_filename_list:
85 | while not status_filename.exists():
86 | time.sleep(1.0)
87 |
88 | if config.get("use_scandir", False):  # Refresh the file system if needed
89 | os.scandir(status_filename.parent)
90 |
91 | status = int(status_filename.read_text())
92 | if status != 0:
93 | raise RuntimeError(f"Job failed with {status} exit code.")
94 | status_filename.unlink()
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system]
2 | requires = ["hatchling", "hatch-vcs"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "aiaccel"
7 | description = "AIST Toolkit for Accelerating Machine Learning Research"
8 | readme = "README.md"
9 | requires-python = ">=3.10"
10 | license = {text = "MIT"}
11 | authors = [
12 | {name = "AIST", email = "onishi-masaki@aist.go.jp"}
13 | ]
14 | classifiers = [
15 | "License :: OSI Approved :: MIT License",
16 | ]
17 | dependencies = [
18 | "attrs",
19 | "numpy",
20 | "scipy",
21 | "optuna>=4.5.0",
22 | "omegaconf",
23 | "hydra-core",
24 | "huggingface-hub",
25 | "colorama",
26 | "lightning>=2.2.1",
27 | "torch>=2.2.0",
28 | "h5py",
29 | "rich",
30 | "tensorboard",
31 | "typing_extensions",
32 | "simpleeval"
33 | ]
34 | dynamic = ["version"]
35 |
36 | [tool.hatch.version]
37 | source = "vcs"
38 |
39 | [project.urls]
40 | documentation = "https://aistairc.github.io/aiaccel/"
41 | repository = "https://github.com/aistairc/aiaccel"
42 |
43 | [project.optional-dependencies]
44 | dev = [
45 | "mypy",
46 | "myst-parser",
47 | "pre-commit",
48 | "pytest",
49 | "pytest-cov",
50 | "pytest-mock",
51 | "pytest-subprocess",
52 | "pytest-xdist",
53 | "ruff",
54 | "sphinx",
55 | "sphinxcontrib-jquery",
56 | "sphinx-autobuild",
57 | "sphinx-intl",
58 | "sphinx-fontawesome",
59 | "sphinx-copybutton",
60 | "shibuya",
61 | "sphinx_design",
62 | "types-colorama",
63 | "types-PyYAML",
64 | "undecorated",
65 | "pandas",
66 | "pandas-stubs",
67 | "matplotlib",
68 | "docstrfmt",
69 | "types-toml",
70 | "hatch",
71 | ]
72 | github-actions = [
73 | "pytest-github-actions-annotate-failures",
74 | ]
75 |
76 | [project.scripts]
77 | aiaccel-job = "aiaccel.launcher:main"
78 | aiaccel-config = "aiaccel.launcher:main"
79 | aiaccel-torch = "aiaccel.launcher:main"
80 | aiaccel-hpo = "aiaccel.launcher:main"
81 |
82 |
83 | # ruff configurations
84 | [tool.ruff]
85 | line-length = 120
86 | target-version = "py310"
87 | fix = true
88 |
89 | [tool.ruff.lint]
90 | select = ["F", "E", "W", "UP", "B", "SIM", "I", "C", "A", "ERA", "N", "C90"]
91 |
92 | [tool.ruff.lint.per-file-ignores]
93 | "__init__.py" = ["F401"]
94 |
95 | [tool.ruff.lint.isort]
96 |
force-sort-within-sections = true 97 | 98 | section-order = [ 99 | "future", 100 | "typing", 101 | "standard-library", 102 | "utilities", 103 | "datascience", 104 | "torch", 105 | "torch-third-party", 106 | "third-party", 107 | "audio", 108 | "first-party", 109 | "local-folder" 110 | ] 111 | 112 | [tool.ruff.lint.isort.sections] 113 | "typing" = ["typing", "typing_extensions", "numpy.typing"] 114 | "utilities" = ["colorama", "hydra", "omegaconf", "progressbar", "rich"] 115 | "datascience" = ["numpy", "scipy", "pandas", "matplotlib", "opt_einsum", "einops"] 116 | "torch" = ["torch"] 117 | "torch-third-party" = ["torchaudio", "torchvision", "auraloss", "lightning", "einops.layers"] 118 | "audio" = ["librosa", "pypesq", "pystoi", "soundfile"] 119 | 120 | 121 | # pixi configurations 122 | [tool.pixi.workspace] 123 | channels = ["conda-forge"] 124 | platforms = ["linux-64"] 125 | 126 | [tool.pixi.dependencies] 127 | python = ">=3.10,<3.11" 128 | 129 | [tool.pixi.pypi-dependencies] 130 | aiaccel = { path = ".", editable = true } 131 | 132 | [tool.pixi.environments] 133 | default = { features = ["dev"] } 134 | 135 | [tool.pixi.tasks] 136 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/hdf5_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | import json 9 | from pathlib import Path 10 | import pickle as pkl 11 | 12 | import torch 13 | from torch.utils.data import Dataset 14 | 15 | import h5py as h5 16 | 17 | __all__ = [ 18 | "RawHDF5Dataset", 19 | "HDF5Dataset", 20 | ] 21 | 22 | 23 | class RawHDF5Dataset(Dataset[dict[str, Any]]): 24 | """ 25 | A dataset class for reading data from HDF5 files. 26 | 27 | Args: 28 | dataset_path (Union[Path, str]): The path to the HDF5 dataset file. 29 | grp_list (Union[Path, str, List[str], None], optional): The list of groups to load from the dataset. 30 | If None, all groups in the dataset will be loaded. If a string or Path, it should be the path to a file 31 | containing the list of groups. If a list, it should directly specify the groups to load. Defaults to None. 32 | 33 | Raises: 34 | NotImplementedError: If grp_list is of an unsupported type. 35 | 36 | Attributes: 37 | dataset_path (Union[Path, str]): The path to the HDF5 dataset file. 38 | grp_list (List[str]): The list of groups to load from the dataset. 39 | f (Optional[h5.File]): The HDF5 file object used for reading the dataset. 
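Example:
    A minimal sketch (assuming a hypothetical ``dataset.hdf5`` that stores one
    HDF5 group per sample, each containing named arrays, as this class
    expects)::

        dataset = RawHDF5Dataset("dataset.hdf5")
        sample = dataset[0]  # dict mapping array names to numpy arrays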
40 |
41 | """
42 |
43 | def __init__(self, dataset_path: Path | str, grp_list: Path | str | list[str] | None = None) -> None:
44 | self.dataset_path = dataset_path
45 |
46 | if grp_list is None:
47 | with h5.File(self.dataset_path, "r") as f:
48 | self.grp_list = list(f.keys())
49 | elif isinstance(grp_list, (str | Path)):
50 | grp_list = Path(grp_list)
51 | if grp_list.suffix == ".pkl":
52 | with open(grp_list, "rb") as f:
53 | self.grp_list = pkl.load(f)
54 | elif grp_list.suffix == ".json":
55 | with open(grp_list) as f:
56 | self.grp_list = json.load(f)
57 | elif isinstance(grp_list, list):
58 | self.grp_list = grp_list
59 | else:
60 | raise NotImplementedError()
61 | self.grp_list.sort()
62 |
63 | self.f: h5.File | None = None
64 |
65 | def __len__(self) -> int:
66 | return len(self.grp_list)
67 |
68 | def __getitem__(self, index: int) -> dict[str, Any]:
69 | if self.f is None:
70 | self.f = h5.File(self.dataset_path, "r")
71 |
72 | return {k: v[:] for k, v in self.f[self.grp_list[index]].items()}  # type: ignore
73 |
74 | def __del__(self) -> None:
75 | if self.f is not None:
76 | self.f.close()
77 |
78 |
79 | class HDF5Dataset(RawHDF5Dataset):
80 | """
81 | A dataset class for loading data from an HDF5 file.
82 |
83 | This class extends the `RawHDF5Dataset` class and provides a convenient way to load data from an HDF5 file
84 | and convert it into a dictionary of torch tensors.
85 |
86 | Args:
87 | dataset_path (Path | str): The path to the HDF5 dataset file.
88 | grp_list (Path | str | list[str] | None, optional): The list of groups to load from the dataset.
89 | See `RawHDF5Dataset` for details. Defaults to None.
90 |
91 | Returns:
92 | dict[str, torch.Tensor]: A dictionary containing the data loaded from the HDF5 file, where the keys are
93 | the names of the data fields and the values are torch tensors.
94 | """
95 |
96 | def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
97 | return {k: torch.as_tensor(v) for k, v in super().__getitem__(index).items()}
98 | -------------------------------------------------------------------------------- /aiaccel/job/apps/sge.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python
2 |
3 |
4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
5 | # SPDX-License-Identifier: MIT
6 |
7 | import os
8 | from pathlib import Path
9 | import shlex
10 | import subprocess
11 | import time
12 |
13 | from aiaccel.job.apps import prepare_argument_parser
14 |
15 |
16 | def main() -> None:
17 | # Load configuration (from the default YAML string)
18 | config, parser, sub_parsers = prepare_argument_parser("sge.yaml")
19 |
20 | args = parser.parse_args()
21 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode
22 |
23 | # Prepare the job script and arguments
24 | job = config[mode].job.format(command=shlex.join(args.command), args=args)
25 |
26 | if mode in ["cpu-array", "gpu-array"]:
27 | job = f"""\
28 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do
29 | TASK_INDEX=$(( SGE_TASK_ID + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) ))
30 |
31 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then
32 | break
33 | fi
34 |
35 | TASK_INDEX=$TASK_INDEX \\
36 | TASK_STEPSIZE={args.n_tasks_per_proc} \\
37 | {job} > {args.log_filename.with_suffix("")}.${{SGE_TASK_ID}}-${{LOCAL_PROC_INDEX}}.log 2>&1 &
38 |
39 | pids[$LOCAL_PROC_INDEX]=$!
40 | done
41 |
42 | for i in "${{!pids[@]}}"; do
43 | wait ${{pids[$i]}}
44 | done
45 | """
46 | job_log_filename = args.log_filename.with_suffix(".$TASK_ID.log")
47 | job_status_filename: Path = args.log_filename.with_suffix(".${SGE_TASK_ID}.out")
48 |
49 | status_filename_list = []
50 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs):
51 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out"))
52 | else:
53 | job_log_filename = args.log_filename
54 | job_status_filename = args.log_filename.with_suffix(".out")
55 |
56 | status_filename_list = [job_status_filename]
57 |
58 | job_script = f"""\
59 | #! /bin/bash
60 |
61 | #$-j y
62 | #$-cwd
63 | #$-o {job_log_filename}
64 |
65 | set -eE -o pipefail
66 | trap 'echo $? > {job_status_filename}' ERR EXIT  # at error and exit
67 | trap 'echo 143 > {job_status_filename}' TERM  # at termination (by job scheduler)
68 |
69 | if [ -n "$PBS_O_WORKDIR" ] && [ "$PBS_ENVIRONMENT" != "PBS_INTERACTIVE" ]; then
70 | cd $PBS_O_WORKDIR
71 | fi
72 |
73 |
74 | {config.script_prologue}
75 |
76 | {job}
77 | """
78 |
79 | qsub = config.qsub.format(args=args)
80 | qsub_args = config[mode].qsub_args.format(args=args)
81 |
82 | # Create the job script file, remove old status files, and run the job
83 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
84 |
85 | job_filename: Path = args.log_filename.with_suffix(".sh")
86 | with open(job_filename, "w") as f:
87 | f.write(job_script)
88 |
89 | for status_filename in status_filename_list:
90 | status_filename.unlink(missing_ok=True)
91 |
92 | subprocess.run(f"{qsub} {qsub_args} {job_filename}", shell=True, check=True)
93 |
94 | for status_filename in status_filename_list:
95 | while not status_filename.exists():
96 | time.sleep(1.0)
97 | if config.get("use_scandir", False):  # Refresh the file system if needed
98 | os.scandir(status_filename.parent)
99 |
100 | status = int(status_filename.read_text())
101 | if status != 0:
102 | raise RuntimeError(f"Job failed with {status} exit code.")
103 | status_filename.unlink()
104 |
105 |
106 | if __name__ == "__main__":
107 | main()
108 | -------------------------------------------------------------------------------- /aiaccel/job/apps/pbs.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | 4 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 5 | # SPDX-License-Identifier: MIT 6 | 7 | import os 8 | from pathlib import Path 9 | import shlex 10 | import subprocess 11 | import time 12 | 13 | from aiaccel.job.apps import prepare_argument_parser 14 | 15 | 16 | def main() -> None: 17 | # Load configuration (from the default YAML string) 18 | config, parser, sub_parsers = prepare_argument_parser("pbs.yaml") 19 | 20 | args = parser.parse_args() 21 | mode = args.mode + "-array" if getattr(args, "n_tasks", None) is not None else args.mode 22 | 23 | # Prepare the job script and arguments 24 | job = config[mode].job.format(command=shlex.join(args.command), args=args) 25 | 26 | if mode in ["cpu-array", "gpu-array"]: 27 | job = f"""\ 28 | for LOCAL_PROC_INDEX in {{1..{args.n_procs}}}; do 29 | TASK_INDEX=$(( PBS_ARRAY_INDEX + {args.n_tasks_per_proc} * (LOCAL_PROC_INDEX - 1) )) 30 | 31 | if [[ $TASK_INDEX -gt {args.n_tasks} ]]; then 32 | break 33 | fi 34 | 35 | TASK_INDEX=$TASK_INDEX \\ 36 | TASK_STEPSIZE={args.n_tasks_per_proc} \\ 37 | {job} > {args.log_filename.with_suffix("")}.${{PBS_ARRAY_INDEX}}-${{LOCAL_PROC_INDEX}}.log 2>&1 & 38 | 39 | pids[$LOCAL_PROC_INDEX]=$! 40 | done 41 | 42 | for i in "${{!pids[@]}}"; do 43 | wait ${{pids[$i]}} 44 | done 45 | """ 46 | job_log_filename = args.log_filename.with_suffix(".^array_index^.log") 47 | job_status_filename: Path = args.log_filename.with_suffix(".${PBS_ARRAY_INDEX}.out") 48 | 49 | status_filename_list = [] 50 | for array_idx in range(0, args.n_tasks, args.n_tasks_per_proc * args.n_procs): 51 | status_filename_list.append(args.log_filename.with_suffix(f".{array_idx + 1}.out")) 52 | else: 53 | job_log_filename = args.log_filename 54 | job_status_filename = args.log_filename.with_suffix(".out") 55 | 56 | status_filename_list = [job_status_filename] 57 | 58 | job_script = f"""\ 59 | #! /bin/bash 60 | 61 | #PBS -j oe 62 | #PBS -k oed 63 | #PBS -o {job_log_filename} 64 | 65 | set -eE -o pipefail 66 | trap 'echo $? 
> {job_status_filename}' ERR EXIT  # at error and exit
67 | trap 'echo 143 > {job_status_filename}' TERM  # at termination (by job scheduler)
68 |
69 | if [ -n "$PBS_O_WORKDIR" ] && [ "$PBS_ENVIRONMENT" != "PBS_INTERACTIVE" ]; then
70 | cd $PBS_O_WORKDIR
71 | fi
72 |
73 |
74 | {config.script_prologue}
75 |
76 | {job}
77 | """
78 |
79 | qsub = config.qsub.format(args=args)
80 | qsub_args = config[mode].qsub_args.format(args=args)
81 |
82 | # Create the job script file, remove old status files, and run the job
83 | args.log_filename.parent.mkdir(exist_ok=True, parents=True)
84 |
85 | job_filename: Path = args.log_filename.with_suffix(".sh")
86 | with open(job_filename, "w") as f:
87 | f.write(job_script)
88 |
89 | for status_filename in status_filename_list:
90 | status_filename.unlink(missing_ok=True)
91 |
92 | subprocess.run(f"{qsub} {qsub_args} {job_filename}", shell=True, check=True)
93 |
94 | for status_filename in status_filename_list:
95 | while not status_filename.exists():
96 | time.sleep(1.0)
97 |
98 | if config.get("use_scandir", False):  # Refresh the file system if needed
99 | os.scandir(status_filename.parent)
100 |
101 | status = int(status_filename.read_text())
102 | if status != 0:
103 | raise RuntimeError(f"Job failed with {status} exit code.")
104 | status_filename.unlink()
105 |
106 |
107 | if __name__ == "__main__":
108 | main()
109 | -------------------------------------------------------------------------------- /aiaccel/torch/datasets/cached_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 |
4 | from typing import Any, TypeVar
5 |
6 | from multiprocessing import Manager
7 |
8 | import torch
9 | from torch.utils.data import Dataset
10 |
11 | __all__ = ["CachedDataset"]
12 |
13 |
14 | class NumpiedTensor:
15 | """
16 | A wrapper class that converts a PyTorch tensor to a NumPy array and vice versa.
17 |
18 | Args:
19 | tensor (torch.Tensor): The input PyTorch tensor.
20 |
21 | Attributes:
22 | array (np.ndarray): The NumPy array representation of the tensor.
23 |
24 | Methods:
25 | to_tensor: Converts the NumPy array back to a PyTorch tensor.
26 | """
27 |
28 | def __init__(self, tensor: torch.Tensor) -> None:
29 | self.array = tensor.numpy()
30 |
31 | def to_tensor(self) -> torch.Tensor:
32 | """
33 | Converts the NumPy array back to a PyTorch tensor.
34 |
35 | Returns:
36 | torch.Tensor: The PyTorch tensor representation of the NumPy array.
37 | """
38 | return torch.tensor(self.array)
39 |
40 |
41 | def numpize_sample(sample: Any) -> Any:
42 | """
43 | Converts the input sample to a NumPy-compatible format.
44 |
45 | Args:
46 | sample (Any): The input sample to be converted.
47 |
48 | Returns:
49 | Any: The converted sample in a NumPy-compatible format.
50 | """
51 |
52 | if isinstance(sample, torch.Tensor):
53 | return NumpiedTensor(sample)
54 | elif isinstance(sample, tuple):
55 | return tuple(numpize_sample(s) for s in sample)
56 | elif isinstance(sample, list):
57 | return [numpize_sample(s) for s in sample]
58 | elif isinstance(sample, dict):
59 | return {k: numpize_sample(v) for k, v in sample.items()}
60 | else:
61 | return sample
62 |
63 |
64 | def tensorize_sample(sample: Any) -> Any:
65 | """
66 | Converts the given sample into a tensor representation.
67 |
68 | Args:
69 | sample (Any): The input sample to be tensorized.
70 | 71 | Returns: 72 | Any: The tensorized representation of the input sample. 73 | """ 74 | 75 | if isinstance(sample, NumpiedTensor): 76 | return sample.to_tensor() 77 | elif isinstance(sample, tuple): 78 | return tuple(tensorize_sample(s) for s in sample) 79 | elif isinstance(sample, list): 80 | return [tensorize_sample(s) for s in sample] 81 | elif isinstance(sample, dict): 82 | return {k: tensorize_sample(v) for k, v in sample.items()} 83 | else: 84 | return sample 85 | 86 | 87 | T_co = TypeVar("T_co", covariant=True) 88 | 89 | 90 | class CachedDataset(Dataset[T_co]): 91 | """ 92 | A dataset wrapper that caches the samples to improve performance. 93 | 94 | Args: 95 | dataset (Dataset): The original dataset to be wrapped. 96 | 97 | Attributes: 98 | dataset (Dataset): The original dataset. 99 | manager (Manager): The multiprocessing manager. 100 | cache (dict): The cache dictionary to store the cached samples. 101 | """ 102 | 103 | def __init__(self, dataset: Dataset[T_co]) -> None: 104 | self.dataset = dataset 105 | 106 | self.manager = Manager() 107 | self.cache = self.manager.dict() 108 | 109 | def __len__(self) -> int: 110 | return len(self.dataset) # type: ignore[arg-type] 111 | 112 | def __getitem__(self, index: int) -> Any: 113 | if index not in self.cache: 114 | self.cache[index] = numpize_sample(self.dataset[index]) 115 | 116 | return tensorize_sample(self.cache[index]) 117 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/plot.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | import glob 5 | 6 | from matplotlib.axes._axes import Axes 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | def get_min_values(values: list[float]) -> list[float]: 13 | min_values = [] 14 | min_value = np.inf 15 | for value in values: 16 | if value < min_value: 17 | min_value = value 18 | min_values.append(min_value) 19 | 20 | return min_values 21 | 22 | 23 | def plot_dim_vs_min_value(ax: Axes, csv_names_for_dim: list[list[list[str]]], title: str) -> None: 24 | label_names = ["nm", "TPE-mcT-mvF", "nm+subTPE-mcT-mvF"] 25 | colors = ["r", "g", "b"] 26 | 27 | dim = ["2", "3", "5", "10", "20", "40"] 28 | 29 | # dim 30 | min_values_for_dim = [] 31 | for csv_names_for_algorism in csv_names_for_dim: 32 | # nm, tpe, nm_subtpe 33 | min_values_for_algorism = [] 34 | for csv_names in csv_names_for_algorism: 35 | min_values = [] 36 | for csv_name in csv_names: 37 | df = pd.read_csv(csv_name) 38 | 39 | df_values = df["value - f_opt"] 40 | print(df_values) 41 | print(min(df_values)) 42 | min_values.append(min(df_values)) 43 | min_values_for_algorism.append(min_values) 44 | min_values_for_dim.append(min_values_for_algorism) 45 | 46 | print(len(min_values_for_dim)) 47 | 48 | for i in range(3): 49 | values = [item[i] for item in min_values_for_dim] 50 | print(values) 51 | values_mean = np.array(values).mean(axis=1) 52 | values_std = np.array(values).std(axis=1) 53 | ax.errorbar( 54 | dim, 55 | values_mean, 56 | yerr=values_std, 57 | capsize=5, 58 | markersize=10, 59 | ecolor=colors[i], 60 | markeredgecolor=colors[i], 61 | color=colors[i], 62 | label=label_names[i], 63 | ) 64 | 65 | if title == "f6": 66 | ax.set_ylim(-1000, 10000) 67 | ax.set_title(title) 68 | ax.grid(axis="both") 69 | ax.legend(fontsize=6) 70 | 71 | 72 | def 
72 | def compare_optimizer(base_dir: str = ".") -> None:
73 |     fig, ax = plt.subplots(5, 5, figsize=(16, 20))
74 |     if isinstance(ax, Axes):  # type narrowing only: a 5x5 grid always yields an array of Axes
75 |         return
76 | 
77 |     for num_of_f in range(1, 25):
78 |         result_csv_list_for_dim = []
79 |         for num_of_dm in [2, 3, 5, 10, 20, 40]:
80 |             result_csv_patterns = [
81 |                 f"{base_dir}/nelder-mead/optuna_csv/optuna-nelder-mead-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
82 |                 f"{base_dir}/TPE/optuna_csv/optuna-TPE-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
83 |                 f"{base_dir}/nelder-mead-subTPE/optuna_csv/optuna-nelder-mead-subTPE-func_id{num_of_f}-dim{num_of_dm}-instance*/f{num_of_f}/DM{num_of_dm:02}/result_bbob_f{num_of_f:03}_i*_d{num_of_dm:02}_*_fopt.csv",
84 |             ]
85 | 
86 |             result_csv_list = [sorted(glob.glob(pattern)) for pattern in result_csv_patterns]
87 |             print(result_csv_list)
88 |             result_csv_list_for_dim.append(result_csv_list)
89 | 
90 |         plot_dim_vs_min_value(
91 |             ax[(num_of_f - 1) // 5, (num_of_f - 1) % 5], result_csv_list_for_dim, f"f{num_of_f}"
92 |         )
93 | 
94 |     plt.savefig("result_bbob_dim_vs_value-fopt_parallel.png")
95 |     plt.show()
96 | 
97 | 
98 | if __name__ == "__main__":
99 |     compare_optimizer()
100 | 
--------------------------------------------------------------------------------
/tests/config/test_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import io
5 | from pathlib import Path
6 | 
7 | from omegaconf import DictConfig
8 | from omegaconf import OmegaConf as oc  # noqa: N813
9 | 
10 | import pytest
11 | 
12 | from aiaccel.config.config import pathlib2str_config, prepare_config, print_config, resolve_inherit
13 | 
14 | 
15 | def test_load_config() -> None:
16 |     config = prepare_config(Path(__file__).parent / "test_conf.yaml")
17 |     assert isinstance(config, DictConfig)
18 |     del config["config_path"]
19 |     del config["working_directory"]
20 |     expected_config = {
21 |         "A": [{"CC": "cc", "AA": "aa", "BB": "bb"}, {"AAA": "aaa"}],
22 |         "B": {"AA": "dummy", "BB": "bb"},
23 |         "C": {"CC": "cc"},
24 |         "D": {"EE": "ee"},
25 |         "E": {"EE": "ee"},
26 |         "Eval": 1.5,
27 |     }
28 | 
29 |     assert config == expected_config
30 | 
31 | 
32 | def test_resolve_inherit() -> None:
33 |     loaded_config = oc.create(
34 |         {
35 |             "A": [{"_inherit_": ["${B}", "${C}"], "AA": "aa"}, {"AAA": "aaa"}],
36 |             "B": {"AA": "dummy", "BB": "bb"},
37 |             "C": {"CC": "cc"},
38 |             "D": {"_inherit_": "${E}"},
39 |             "E": {"EE": "ee"},
40 |         }
41 |     )
42 |     resolved_config = resolve_inherit(loaded_config)
43 |     expected_config = {
44 |         "A": [{"CC": "cc", "AA": "aa", "BB": "bb"}, {"AAA": "aaa"}],
45 |         "B": {"AA": "dummy", "BB": "bb"},
46 |         "C": {"CC": "cc"},
47 |         "D": {"EE": "ee"},
48 |         "E": {"EE": "ee"},
49 |     }
50 | 
51 |     assert resolved_config == expected_config
52 | 
53 | 
54 | def test_resolve_path() -> None:
55 |     config = prepare_config(Path(__file__).parent / "test_resolve_path.yaml")
56 | 
57 |     assert isinstance(config, DictConfig)
58 | 
59 | 
60 | def test_print_config(capfd: pytest.CaptureFixture[str]) -> None:
61 |     conf = oc.create({"foo": {"bar": [1, 2, 3]}})
62 |     print_config(conf)
63 | 
64 |     stdout, _ = capfd.readouterr()
65 |     # To regenerate the golden file below, uncomment:
66 |     # with open(Path(__file__).parent / "test_config_assets" / "print_config.txt", "w") as f:
67 |     #     f.write(stdout)  # noqa:
ERA001 68 | 69 | with open(Path(__file__).parent / "test_config_assets" / "print_config.txt") as f: 70 | stdout_target = f.read() 71 | 72 | assert stdout == stdout_target 73 | 74 | 75 | def test_pathlib2str_config() -> None: 76 | src_conf = oc.create({"foo": {"bar": Path("test/path")}}) 77 | dst_conf = pathlib2str_config(src_conf) 78 | 79 | assert isinstance(dst_conf.foo.bar, str) 80 | assert isinstance(src_conf.foo.bar, Path) 81 | 82 | 83 | def test_load_config_print_option(capfd: pytest.CaptureFixture[str]) -> None: 84 | prepare_config( 85 | Path(__file__).parent / "test_conf.yaml", 86 | print_config=True, 87 | print_config_kwargs={"line_length": 40}, 88 | ) 89 | 90 | stdout, _ = capfd.readouterr() 91 | assert "=" * 40 in stdout 92 | 93 | 94 | def test_print_config_kwargs() -> None: 95 | buffer = io.StringIO() 96 | conf = oc.create({"foo": 1}) 97 | print_config(conf, line_length=10, file=buffer) 98 | 99 | output = buffer.getvalue() 100 | assert "=" * 10 in output 101 | 102 | 103 | def test_load_config_save_option(tmp_path: Path) -> None: 104 | save_dir = tmp_path / "saved" 105 | config = prepare_config( 106 | Path(__file__).parent / "test_conf.yaml", 107 | working_directory=tmp_path, 108 | save_config=True, 109 | save_directory=save_dir, 110 | save_filename="custom.yaml", 111 | ) 112 | 113 | save_path = save_dir / "custom.yaml" 114 | 115 | assert save_path.exists() 116 | assert isinstance(config.working_directory, str) 117 | 118 | reloaded_config = oc.load(save_path) 119 | assert "config_path" in reloaded_config 120 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/datamodules/single_datamodule.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any 5 | 6 | from collections.abc import Callable 7 | 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | import lightning as lt 11 | 12 | from aiaccel.torch.datasets import CachedDataset, scatter_dataset 13 | 14 | 15 | class SingleDataModule(lt.LightningDataModule): 16 | """ 17 | A PyTorch Lightning DataModule designed to handle training and validation datasets 18 | with support for caching and dataset scattering. 19 | 20 | Attributes: 21 | train_dataset_fn (Callable[..., Dataset[str]]): A callable function to create the training dataset. 22 | val_dataset_fn (Callable[..., Dataset[str]]): A callable function to create the validation dataset. 23 | batch_size (int): The batch size for the DataLoader. 24 | use_cache (bool): Whether to cache the datasets. Defaults to False. 25 | use_scatter (bool): Whether to scatter the datasets. Defaults to True. 26 | num_workers (int): Number of workers for the DataLoader. Defaults to 10. 27 | common_args (dict[str, Any] | None): Common arguments to pass to the dataset functions. Defaults to None. 28 | Methods: 29 | setup(stage: str | None) -> None: 30 | Prepares the datasets for training and validation. Only supports the "fit" stage. 31 | Raises a ValueError if the stage is not "fit". 32 | train_dataloader() -> DataLoader: 33 | Returns the DataLoader for the training dataset. 34 | val_dataloader() -> DataLoader: 35 | Returns the DataLoader for the validation dataset. 36 | _create_dataloader(dataset, **kwargs: Any) -> DataLoader: 37 | Internal method to create a DataLoader for a given dataset with specified configurations. 
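
    Example:
        An illustrative sketch (``MyDataset`` stands in for any dataset factory
        accepting the ``common_args`` keywords)::

            datamodule = SingleDataModule(
                train_dataset_fn=MyDataset,
                val_dataset_fn=MyDataset,
                batch_size=32,
                common_args={"root": "./data"},
            )
            datamodule.setup("fit")
            train_loader = datamodule.train_dataloader()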
38 | """ 39 | 40 | def __init__( 41 | self, 42 | train_dataset_fn: Callable[..., Dataset[str]], 43 | val_dataset_fn: Callable[..., Dataset[str]], 44 | batch_size: int, 45 | use_cache: bool = False, 46 | use_scatter: bool = True, 47 | num_workers: int = 10, 48 | common_args: dict[str, Any] | None = None, 49 | ): 50 | super().__init__() 51 | 52 | self.train_dataset_fn = train_dataset_fn 53 | self.val_dataset_fn = val_dataset_fn 54 | 55 | self.common_args = common_args if common_args is not None else {} 56 | 57 | self.batch_size = batch_size 58 | 59 | self.use_cache = use_cache 60 | self.use_scatter = use_scatter 61 | 62 | self.num_workers = num_workers 63 | 64 | def setup(self, stage: str | None) -> None: 65 | if stage == "fit": 66 | train_dataset = self.train_dataset_fn(**self.common_args) 67 | val_dataset = self.val_dataset_fn(**self.common_args) 68 | 69 | print(f"Dataset size: {len(train_dataset)=}, {len(val_dataset)=}") # type: ignore 70 | 71 | if self.use_cache: 72 | train_dataset = CachedDataset(train_dataset) 73 | val_dataset = CachedDataset(val_dataset) 74 | 75 | if self.use_scatter: 76 | train_dataset = scatter_dataset(train_dataset) 77 | val_dataset = scatter_dataset(val_dataset) 78 | 79 | self.train_dataset = train_dataset 80 | self.val_dataset = val_dataset 81 | else: 82 | raise ValueError("`stage` is not 'fit'.") 83 | 84 | def _create_dataloader(self, dataset: Dataset[Any], **kwargs: Any) -> DataLoader[Any]: 85 | return DataLoader( 86 | dataset=dataset, 87 | batch_size=self.batch_size, 88 | num_workers=self.num_workers, 89 | persistent_workers=True, 90 | shuffle=True, 91 | pin_memory=True, 92 | **kwargs, 93 | ) 94 | 95 | def train_dataloader(self) -> DataLoader[Any]: 96 | return self._create_dataloader(self.train_dataset, drop_last=True) 97 | 98 | def val_dataloader(self) -> DataLoader[Any]: 99 | return self._create_dataloader(self.val_dataset, drop_last=False) 100 | -------------------------------------------------------------------------------- /docs/source/user_guide/config.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Managing Configurations 3 | ######################### 4 | 5 | This guide introduces how to manage configuration files using ``aiaccel.config`` and 6 | `Hydra's instantiation mechanism 7 | `_. The key features of 8 | ``aiaccel.config`` are: 9 | 10 | - Modular programming through YAML meta-programming 11 | - Efficient management of multiple config files using ``_base_`` and ``_inherit_`` 12 | attributes 13 | - Easy version control integration with Git 14 | - Minimal dependency on Hydra (only uses ``hydra.utils.instantiate``) 15 | 16 | ***************** 17 | Getting Started 18 | ***************** 19 | 20 | Aiaccel's configuration system is based on `OmegaConf 21 | `_. The typical usage is: 22 | 23 | .. code-block:: yaml 24 | :caption: config.yaml 25 | 26 | model: 27 | _target_: torchvision.models.resnet50 28 | num_classes: 13 29 | 30 | .. code-block:: python 31 | :caption: example.py 32 | 33 | from argparse import ArgumentParser 34 | 35 | from aiaccel.config import ( 36 | prepare_config, 37 | print_config, 38 | ) 39 | from hydra.utils import instantiate 40 | 41 | 42 | parser = ArgumentParser() 43 | parser.add_argument("config", type=str, help="Config file in YAML format") 44 | args, unk_args = parser.parse_known_args() 45 | 46 | config = prepare_config(args.config) 47 | print_config(config) 48 | 49 | model = instantiate(config.model) 50 | 51 | print(model) 52 | 53 | ... 
54 | 
55 | To run the script:
56 | 
57 | .. code-block:: bash
58 | 
59 |     python example.py config.yaml
60 | 
61 | ``prepare_config`` wraps :func:`aiaccel.config.load_config`, processes the ``_base_``
62 | attribute, resolves ``_inherit_`` entries, and returns the ready-to-use configuration
63 | while also allowing you to forward options to :func:`load_config` via
64 | ``load_config_kwargs``.
65 | 
66 | ******************************
67 |  ``_base_`` and ``_inherit_``
68 | ******************************
69 | 
70 | The ``_base_`` attribute allows you to inherit from another configuration file.
71 | 
72 | Example base configuration:
73 | 
74 | .. code-block:: yaml
75 |     :caption: config_base.yaml
76 | 
77 |     params:
78 |       _convert_: partial
79 |       _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
80 |       x1: [0, 1]
81 |       x2:
82 |         _target_: aiaccel.hpo.optuna.hparams.Float
83 |         low: 0.0
84 |         high: 1.0
85 |         log: false
86 | 
87 | Example configuration that uses a base:
88 | 
89 | .. code-block:: yaml
90 |     :caption: config.yaml
91 | 
92 |     _base_: config_base.yaml
93 |     n_trials: 100
94 |     n_max_jobs: 4
95 | 
96 | ``config.yaml`` is automatically expanded to include the contents of
97 | ``config_base.yaml``.
98 | 
99 | The ``_inherit_`` attribute, on the other hand, allows you to duplicate and modify parts
100 | of the configuration. Example configuration:
101 | 
102 | .. code-block:: yaml
103 |     :caption: config.yaml
104 | 
105 |     params:
106 |       _convert_: partial
107 |       _target_: aiaccel.hpo.optuna.hparams_manager.HparamsManager
108 |       x1:
109 |         _inherit_: "${param}"
110 |       x2:
111 |         _inherit_: "${param}"
112 | 
113 |     objective:
114 |       _target_: objective.main
115 | 
116 |     n_trials: 30
117 |     n_max_jobs: 4
118 | 
119 |     param:
120 |       _target_: aiaccel.hpo.optuna.hparams.Float
121 |       low: 0.0
122 |       high: 1.0
123 |       log: false
124 | 
125 | After processing, the configuration will be expanded so that ``x1`` and ``x2`` each
126 | include the contents of ``param`` along with their own ``name`` fields.
127 | 
128 | *******************
129 |  ``eval`` Resolver
130 | *******************
131 | 
132 | The ``eval`` resolver allows arithmetic expressions within the config. It is
133 | implemented with a safe, restricted form of ``eval``.
134 | 
135 | Example configuration:
136 | 
137 | .. code-block:: yaml
138 |     :caption: config.yaml
139 | 
140 |     n_trials: ${eval:"${n_max_jobs} * 10"}
141 |     n_max_jobs: 4
142 | 
143 | *********************
144 |  Version Controlling
145 | *********************
146 | 
147 | WIP. In the meantime, ``aiaccel.config.git`` provides ``collect_git_status_from_config`` and ``print_git_status`` for recording the Git state of the packages referenced by ``_target_`` entries.
148 | 
149 | ************************
150 |  Additional Information
151 | ************************
152 | 
153 | Detailed information is available at :doc:`API Reference <../api_reference/config>`.
154 | -------------------------------------------------------------------------------- /tests/hpo/apps/test_optimize.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from collections.abc import Callable, Generator 5 | from contextlib import AbstractContextManager, contextmanager 6 | import os 7 | from pathlib import Path 8 | import shutil 9 | import subprocess 10 | 11 | from hydra.utils import instantiate 12 | 13 | import pytest 14 | 15 | from aiaccel.config import prepare_config 16 | 17 | 18 | @pytest.fixture() 19 | def workspace_factory( 20 | tmp_path_factory: pytest.TempPathFactory, 21 | ) -> Callable[[str], AbstractContextManager[Path]]: 22 | @contextmanager 23 | def _factory(data_name: str = "single_objective") -> Generator[Path, None, None]: 24 | tmp_path = tmp_path_factory.mktemp("workspace") 25 | 26 | shutil.copytree(Path(__file__).parent / "data" / data_name, tmp_path, dirs_exist_ok=True) 27 | 28 | org_path = Path.cwd() 29 | 30 | try: 31 | os.chdir(tmp_path) 32 | yield tmp_path 33 | finally: 34 | os.chdir(org_path) 35 | 36 | return _factory 37 | 38 | 39 | def test_from_config(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 40 | with workspace_factory() as workspace: 41 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 42 | 43 | assert (workspace / "optuna.db").exists() 44 | 45 | config = prepare_config(workspace / "merged_config.yaml") 46 | study = instantiate(config.study) 47 | assert len(study.get_trials()) == 15 48 | 49 | 50 | def test_from_cli(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 51 | with workspace_factory() as workspace: 52 | cmd = ( 53 | "aiaccel-hpo optimize" 54 | " working_directory=./cli/" 55 | " n_trials=15" 56 | " n_max_jobs=1" 57 | " params.x1='[0,1]'" 58 | " params.x2='[0,1]'" 59 | " study.sampler._target_=optuna.samplers.TPESampler" 60 | " study.sampler.seed=0" 61 | " --" 62 | " python ./objective.py --x1={x1} --x2={x2} {out_filename}" 63 | ) 64 | subprocess.run(cmd, shell=True, check=True) 65 | 66 | config = prepare_config(workspace / "cli" / "merged_config.yaml") 67 | study = instantiate(config.study) 68 | best_value = study.best_trial.value 69 | 70 | # check consistency with the config-style execution 71 | with workspace_factory() as workspace: 72 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 73 | 74 | config = prepare_config(workspace / "merged_config.yaml") 75 | study = instantiate(config.study) 76 | 77 | assert best_value == study.best_trial.value 78 | 79 | 80 | def test_resume(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 81 | with workspace_factory() as workspace: 82 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 83 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 84 | 85 | config = prepare_config(workspace / "merged_config.yaml") 86 | study = instantiate(config.study) 87 | assert len(study.get_trials()) == 30 88 | 89 | 90 | def test_multi_objective(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 91 | with workspace_factory("multi_objective") as workspace: 92 | subprocess.run("aiaccel-hpo optimize --config=config.yaml", shell=True, check=True) 93 | 94 | config = prepare_config(workspace / "merged_config.yaml") 95 | study = 
instantiate(config.study) 96 | 97 | assert len(study.get_trials()) == 15 98 | 99 | assert all(len(trial.values) == 2 for trial in study.get_trials()) 100 | 101 | 102 | def test_from_cli_and_config(workspace_factory: Callable[..., AbstractContextManager[Path]]) -> None: 103 | with workspace_factory() as workspace: 104 | cmd = ( 105 | "aiaccel-hpo optimize" 106 | " n_trials=30" 107 | " n_max_jobs=1" 108 | " params.x1='[0,10]'" 109 | " params.x2='[0,10]'" 110 | " study.sampler._target_=optuna.samplers.TPESampler" 111 | " study.sampler.seed=0" 112 | " --config=config.yaml" 113 | " --" 114 | ) 115 | subprocess.run(cmd, shell=True, check=True) 116 | 117 | assert (workspace / "optuna.db").exists() 118 | 119 | config = prepare_config(workspace / "merged_config.yaml") 120 | study = instantiate(config.study) 121 | assert len(study.get_trials()) == 30 122 | -------------------------------------------------------------------------------- /aiaccel/hpo/apps/optimize.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from typing import Any 5 | 6 | import argparse 7 | from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait 8 | from datetime import datetime 9 | from importlib import resources 10 | import json 11 | from pathlib import Path 12 | import shlex 13 | import subprocess 14 | import sys 15 | 16 | from hydra.utils import instantiate 17 | from omegaconf import OmegaConf as oc # noqa: N813 18 | 19 | from optuna.trial import Trial 20 | 21 | from aiaccel.config import pathlib2str_config, prepare_config, print_config 22 | 23 | 24 | def main() -> None: 25 | # remove OmegaConf arguments from sys.argv 26 | oc_args = [] 27 | if "--" in sys.argv: # If there are additional arguments before '--', treat them as OmegaConf arguments 28 | sep_idx = sys.argv.index("--") 29 | sys.argv.pop(sep_idx) 30 | 31 | for ii in range(0, sep_idx)[::-1]: 32 | if "=" in sys.argv[ii] and not sys.argv[ii].startswith("-"): 33 | oc_args.append(sys.argv.pop(ii)) 34 | 35 | oc_args = list(reversed(oc_args)) 36 | 37 | # parse arguments 38 | parser = argparse.ArgumentParser( 39 | description="""\ 40 | A helper CLI to optimize hyperparameters using Optuna. 41 | See complete usage: https://aistairc.github.io/aiaccel/user_guide/hpo.html . 
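        Placeholders such as {x1}, {x2}, and {out_filename} in the command are filled
        in for every trial before the command is executed.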
42 | 
43 |         Typical usages:
44 |             aiaccel-hpo optimize params.x1=[0,1] params.x2=[0,1] -- ./objective.py --x1={x1} --x2={x2} {out_filename}
45 |             aiaccel-hpo optimize --config=config.yaml ./objective.py --x1={x1} --x2={x2} {out_filename}
46 |         """,
47 |         formatter_class=argparse.RawTextHelpFormatter,
48 |     )
49 |     parser.add_argument("--config", type=Path, default=None, help="Path to the configuration file.")
50 |     parser.add_argument("command", nargs=argparse.REMAINDER)
51 | 
52 |     args = parser.parse_args()
53 | 
54 |     # load config
55 |     if args.config is None:
56 |         args.config = resources.files(f"{__package__}.config") / "default.yaml"
57 |         working_directory = Path.cwd().resolve() / f"aiaccel-hpo_{datetime.now():%Y-%m-%d-%H-%M-%S}"
58 |     else:
59 |         working_directory = args.config.parent.resolve()
60 | 
61 |     config = prepare_config(
62 |         config_filename=args.config,
63 |         working_directory=working_directory,
64 |         overwrite_config=oc.from_cli(oc_args),
65 |     )
66 | 
67 |     if len(args.command) > 0:
68 |         config.command = args.command
69 | 
70 |     print_config(config)
71 | 
72 |     # save config
73 |     config.working_directory = Path(config.working_directory)
74 |     config.working_directory.mkdir(parents=True, exist_ok=True)
75 | 
76 |     with open(config.working_directory / "merged_config.yaml", "w") as f:
77 |         oc.save(pathlib2str_config(config), f)
78 | 
79 |     # build study and hparams manager
80 |     study = instantiate(config.study)
81 |     params = instantiate(config.params)
82 | 
83 |     # main loop
84 |     futures: dict[Any, tuple[Trial, Path]] = {}
85 |     submitted_job_count = 0
86 |     finished_job_count = 0
87 | 
88 |     with ThreadPoolExecutor(config.n_max_jobs) as pool:
89 |         while finished_job_count < config.n_trials:
90 |             active_jobs = len(futures)
91 |             available_slots = max(0, config.n_max_jobs - active_jobs)
92 | 
93 |             # Submit jobs to the ThreadPoolExecutor
94 |             for _ in range(min(available_slots, config.n_trials - submitted_job_count)):
95 |                 trial = study.ask()
96 | 
97 |                 out_filename = config.working_directory / f"trial_{trial.number:0>6}.json"
98 | 
99 |                 future = pool.submit(
100 |                     subprocess.run,
101 |                     shlex.join(config.command).format(
102 |                         config=config,
103 |                         job_name=f"trial_{trial.number:0>6}",
104 |                         out_filename=out_filename,
105 |                         **params.suggest_hparams(trial),
106 |                     ),
107 |                     shell=True,
108 |                     check=True,
109 |                 )
110 | 
111 |                 futures[future] = trial, out_filename
112 |                 submitted_job_count += 1
113 | 
114 |             # Read each finished trial's result from out_filename and tell the study
115 |             done_futures, _ = wait(futures.keys(), return_when=FIRST_COMPLETED)
116 |             for future in done_futures:
117 |                 trial, out_filename = futures.pop(future)
118 | 
119 |                 with open(out_filename) as f:
120 |                     y = json.load(f)
121 | 
122 |                 out_filename.unlink()
123 | 
124 |                 frozentrial = study.tell(trial, y)
125 |                 study._log_completed_trial(y if isinstance(y, list) else [y], frozentrial.number, frozentrial.params)
126 |                 finished_job_count += 1
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     main()
131 | 
--------------------------------------------------------------------------------
/aiaccel/config/git.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | from dataclasses import dataclass
5 | import importlib.util
6 | import os
7 | from pathlib import Path
8 | import subprocess
9 | 
10 | from omegaconf import DictConfig, ListConfig
11 | 
12 | __all__ = [
13 |     "PackageGitStatus",
14 |     "collect_git_status_from_config",
15 | "print_git_status", 16 | ] 17 | 18 | 19 | @dataclass 20 | class PackageGitStatus: 21 | """ 22 | Represents the Git status of a package. 23 | 24 | Attributes: 25 | package_name (str): The name of the package. 26 | commit_id (str): The current Git commit ID of the repository. 27 | status (list[str]): A list of uncommitted files in the repository. 28 | """ 29 | 30 | package_name: str 31 | commit_id: str 32 | status: list[str] 33 | 34 | def ready(self) -> bool: 35 | """ 36 | Determines if there are no uncommitted changes. 37 | 38 | Returns: 39 | bool: True if there are no uncommitted files, otherwise False. 40 | """ 41 | 42 | return len(self.status) == 0 43 | 44 | 45 | def collect_git_status_from_config(config: DictConfig | ListConfig) -> list[PackageGitStatus]: 46 | """ 47 | Collects the Git status of packages specified in the given configuration. 48 | 49 | Args: 50 | config (DictConfig | ListConfig): The configuration containing package references. 51 | 52 | Returns: 53 | list[PackageGitStatus]: A list of `PackageGitStatus` objects representing 54 | the Git status of the detected packages. 55 | """ 56 | 57 | status_list = [] 58 | 59 | package_names = collect_target_packages(config) 60 | package_names.sort() 61 | 62 | for package_name in package_names: 63 | status = get_git_status(package_name) 64 | 65 | if status is not None: 66 | status_list.append(status) 67 | 68 | return status_list 69 | 70 | 71 | def print_git_status(status: PackageGitStatus | list[PackageGitStatus]) -> None: 72 | """ 73 | Prints the Git status of a package or a list of packages. 74 | 75 | Args: 76 | status (PackageGitStatus | list[PackageGitStatus]): The Git status to print. 77 | """ 78 | 79 | status_list = status if isinstance(status, list) else [status] 80 | 81 | for status in status_list: 82 | print(f"{status.package_name} @ {status.commit_id}") 83 | for st in status.status: 84 | print(f" {st}") 85 | 86 | 87 | def get_git_status(package_name: str) -> PackageGitStatus | None: 88 | """ 89 | Retrieves the Git status of a given package. 90 | 91 | Args: 92 | package_name (str): The name of the package to check. 93 | 94 | Returns: 95 | PackageGitStatus | None: A `PackageGitStatus` object if the package is found 96 | and under Git control, otherwise None. 
97 | """ 98 | 99 | # get package location 100 | spec = importlib.util.find_spec(package_name) 101 | 102 | if spec is None: 103 | return None 104 | 105 | if spec.origin is not None: 106 | module_path = Path(spec.origin).parent.resolve() 107 | elif spec.submodule_search_locations is not None: 108 | module_path = Path(os.path.abspath(spec.submodule_search_locations[0])).resolve() 109 | else: 110 | return None 111 | 112 | # get repository path 113 | result = subprocess.run(["git", "rev-parse", "--show-toplevel"], cwd=module_path, capture_output=True, text=True) 114 | if result.returncode != 0: 115 | return None 116 | 117 | repository_path = Path(result.stdout.splitlines()[0]).resolve() 118 | 119 | # check git_ignore 120 | result = subprocess.run(["git", "check-ignore", module_path], cwd=repository_path, capture_output=True, text=True) 121 | if result.returncode == 0: 122 | return None 123 | 124 | # get commit id 125 | result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=repository_path, capture_output=True, text=True) 126 | commit_id = result.stdout.splitlines()[0] 127 | 128 | # check git status 129 | result = subprocess.run(["git", "status", "-s"], cwd=repository_path, capture_output=True, text=True) 130 | status = result.stdout.splitlines() 131 | 132 | return PackageGitStatus(package_name, commit_id, status) 133 | 134 | 135 | def collect_target_packages(config: ListConfig | DictConfig) -> list[str]: 136 | """ 137 | Extracts the names of target packages from the given configuration. 138 | 139 | Args: 140 | config (ListConfig | DictConfig): The configuration to process. 141 | 142 | Returns: 143 | list[str]: A list of package names extracted from the configuration. 144 | """ 145 | 146 | target_packages = set() 147 | 148 | def inner_func(_config: ListConfig | DictConfig) -> None: 149 | if isinstance(_config, DictConfig): 150 | for key, value in _config.items(): 151 | if key == "_target_": 152 | package_name, *_ = value.split(".") 153 | target_packages.add(package_name) 154 | 155 | inner_func(value) 156 | 157 | elif isinstance(_config, ListConfig): 158 | for item in _config: 159 | inner_func(item) 160 | 161 | inner_func(config) 162 | 163 | return list(target_packages) 164 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/opt_lightning_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | from collections.abc import Callable, Iterator 9 | from dataclasses import dataclass 10 | from fnmatch import fnmatch 11 | 12 | from torch import nn, optim 13 | 14 | import lightning as lt 15 | from lightning.pytorch.utilities.types import OptimizerLRSchedulerConfig 16 | 17 | 18 | @dataclass 19 | class OptimizerConfig: 20 | """ 21 | Configuration for the optimizer and scheduler in a LightningModule. 22 | 23 | Args: 24 | optimizer_generator (Callable[..., optim.Optimizer]): A callable that generates the optimizer. 25 | params_transformer (Callable[..., Iterator[tuple[str, Any]]] | None): A callable that transforms the parameters 26 | into a format suitable for the optimizer. If None, the parameters are used as is. Defaults to None. 27 | scheduler_generator (Callable[..., optim.lr_scheduler.LRScheduler] | None): 28 | A callable that generates the learning rate scheduler. If None, no scheduler is used. 
Defaults to None.
29 |         scheduler_interval (str | None): The interval at which the scheduler is called. Defaults to "step".
30 |         scheduler_monitor (str | None): The metric to monitor for the scheduler. Defaults to "validation/loss".
31 |     """
32 | 
33 |     optimizer_generator: Callable[..., optim.Optimizer]
34 |     params_transformer: Callable[..., Iterator[tuple[str, Any]]] | None = None
35 | 
36 |     scheduler_generator: Callable[..., optim.lr_scheduler.LRScheduler] | None = None
37 |     scheduler_interval: str | None = "step"
38 |     scheduler_monitor: str | None = "validation/loss"
39 | 
40 | 
41 | def build_param_groups(
42 |     named_params: Iterator[tuple[str, nn.Parameter]],
43 |     groups: list[dict[str, Any]],
44 | ) -> list[dict[str, Any]]:
45 |     """
46 |     Build parameter groups for the optimizer based on the provided patterns.
47 | 
48 |     Args:
49 |         named_params (Iterator[tuple[str, nn.Parameter]]): An iterator of named parameters.
50 |         groups (list[dict[str, Any]]): A list of dictionaries where each dictionary contains
51 |             a "pattern" key that specifies the parameter names to match (``fnmatch``), and other optional keys.
52 | 
53 |     Example:
54 |         In your config file, you might have:
55 | 
56 |         .. code-block:: yaml
57 | 
58 |             optimizer_config:
59 |               _target_: aiaccel.torch.lightning.OptimizerConfig
60 |               optimizer_generator:
61 |                 _partial_: True
62 |                 _target_: torch.optim.AdamW
63 |                 weight_decay: 0.01
64 |               params_transformer:
65 |                 _partial_: True
66 |                 _target_: aiaccel.torch.lightning.build_param_groups
67 |                 groups:
68 |                   - pattern: "*bias"
69 |                     lr: 0.01
70 |                   - pattern: "*weight"
71 |                     lr: 0.001
72 | 
73 |         This will create two parameter groups: one for biases with a learning rate of 0.01 and another for weights with
74 |         a learning rate of 0.001.
75 |     """
76 |     remaining = dict(named_params)
77 | 
78 |     param_groups = []
79 |     for spec in groups:
80 |         matched_params = []
81 |         for target in [spec["pattern"]] if isinstance(spec["pattern"], str) else spec["pattern"]:
82 |             matched_params += [remaining.pop(name) for name in list(remaining.keys()) if fnmatch(name, target)]
83 | 
84 |         assert len(matched_params) > 0, f"No parameters matched pattern(s): {spec['pattern']}"
85 | 
86 |         param_groups.append({"params": matched_params} | {k: v for k, v in spec.items() if k != "pattern"})
87 | 
88 |     param_groups.append({"params": list(remaining.values())})
89 | 
90 |     return param_groups
91 | 
92 | 
93 | class OptimizerLightningModule(lt.LightningModule):
94 |     """
95 |     LightningModule subclass for models that use custom optimizers and schedulers.
96 | 
97 |     Args:
98 |         optimizer_config (OptimizerConfig): Configuration object for the optimizer.
99 | 
100 |     Attributes:
101 |         _optimizer_config (OptimizerConfig): Configuration object for the optimizer.
102 | 
103 |     Methods:
104 |         configure_optimizers: Configures the optimizer and scheduler for training.
105 |     """
106 | 
107 |     def __init__(self, optimizer_config: OptimizerConfig):
108 |         super().__init__()
109 | 
110 |         self._optimizer_config = optimizer_config
111 | 
112 |     def configure_optimizers(self) -> optim.Optimizer | OptimizerLRSchedulerConfig:
113 |         """
114 |         Configures the optimizer and scheduler for training.
115 | 
116 |         Returns:
117 |             Union[optim.Optimizer, OptimizerLRSchedulerConfig]: The optimizer and scheduler configuration.
118 | """ 119 | 120 | params: Iterator[tuple[str, Any]] | Iterator[nn.Parameter] 121 | if self._optimizer_config.params_transformer is None: 122 | params = self.parameters() # just because backward compatibility 123 | else: 124 | params = self._optimizer_config.params_transformer(self.named_parameters()) 125 | 126 | optimizer = self._optimizer_config.optimizer_generator(params=params) # 127 | 128 | if self._optimizer_config.scheduler_generator is None: 129 | return optimizer 130 | else: 131 | assert self._optimizer_config.scheduler_interval is not None 132 | assert self._optimizer_config.scheduler_monitor is not None 133 | return { 134 | "optimizer": optimizer, 135 | "lr_scheduler": { 136 | "scheduler": self._optimizer_config.scheduler_generator(optimizer=optimizer), 137 | "interval": self._optimizer_config.scheduler_interval, 138 | "monitor": self._optimizer_config.scheduler_monitor, 139 | }, 140 | } 141 | -------------------------------------------------------------------------------- /aiaccel/torch/lightning/callbacks/load_pretrained.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 2 | # SPDX-License-Identifier: MIT 3 | 4 | from __future__ import annotations 5 | 6 | from fnmatch import fnmatch 7 | import logging 8 | from pathlib import Path 9 | import re 10 | 11 | import torch 12 | 13 | import lightning as lt 14 | 15 | from aiaccel.torch.lightning import load_checkpoint 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class LoadPretrainedCallback(lt.Callback): 21 | """Initialize a model from a pretrained checkpoint before training or validation. 22 | 23 | The callback loads weights from ``model_path`` once fitting or validation begins, 24 | matches finetune parameters to pretrained ones using glob-like patterns, and copies 25 | the matching weights into the finetune module before any optimization steps run. 26 | 27 | Args: 28 | model_path: Directory containing checkpoints saved by :func:`load_checkpoint`. 29 | target_patterns: Glob expressions that describe finetune parameters which 30 | should be initialized from pretrained weights. 31 | pattern_map: Optional mapping from finetune patterns to pretrained ones. 32 | Wildcards (``"*"``) are allowed and must appear the same number of times 33 | on both sides of the mapping. 34 | source_excludes: Optional pretrained-side glob patterns that should never be 35 | copied even when referenced by ``pattern_map``. 36 | target_excludes: Optional finetune-side glob patterns that should never be 37 | overwritten. 38 | config_name: Name of the checkpoint configuration to load. 
39 | 40 | Example:: 41 | 42 | callback = LoadPretrainedCallback( 43 | model_path=Path("pretrain_ckpt"), 44 | target_patterns=["detr_module.*"], 45 | pattern_map={"backbone.*": "visual_backbone.*"}, 46 | source_excludes=["detr_module.heads.cls_head.*"], 47 | config_name="merged_config.yaml", 48 | ) 49 | trainer = lt.Trainer(callbacks=[callback]) 50 | trainer.fit(model) 51 | """ 52 | 53 | def __init__( 54 | self, 55 | model_path: Path, 56 | target_patterns: list[str], 57 | pattern_map: dict[str, str] | None = None, 58 | source_excludes: list[str] | None = None, 59 | target_excludes: list[str] | None = None, 60 | config_name: str = "merged_config.yaml", 61 | ) -> None: 62 | super().__init__() 63 | 64 | # remember configuration about where to load and which config to use 65 | self.model_path = Path(model_path) 66 | self.config_name = config_name 67 | 68 | pattern_map = pattern_map or {} 69 | assert set(pattern_map) <= set(target_patterns) 70 | 71 | # build pattern dictionary used to match finetune parameters to pretrained ones 72 | pattern_dict = {ptn: ptn for ptn in target_patterns} 73 | pattern_dict.update(pattern_map) 74 | 75 | # remember exclusion filters for finetune and pretrained parameters 76 | self.source_excludes = source_excludes or [] 77 | self.target_excludes = target_excludes or [] 78 | 79 | # cache the derived mappings and bookkeeping flags 80 | self._ptn_dict = pattern_dict 81 | self._loaded = False 82 | 83 | @torch.no_grad() 84 | def on_fit_start(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None: # type: ignore[override] 85 | """Load pretrained weights and copy them into matching finetune parameters.""" 86 | if self._loaded: 87 | return 88 | 89 | # load pretrained checkpoint and copy it to CPU tensors 90 | src_model, *_ = load_checkpoint(self.model_path, self.config_name, device="cpu") 91 | src_state_dict = {name: weight.cpu() for name, weight in src_model.state_dict().items()} 92 | dst_state_dict = dict(pl_module.state_dict()) 93 | 94 | # iterate over each user-defined pattern rule 95 | for dst_ptn, src_ptn in self._ptn_dict.items(): 96 | assert dst_ptn.count("*") == src_ptn.count("*") 97 | rgx_ptn = re.compile("^" + re.escape(dst_ptn).replace(r"\*", "(.*)") + "$") 98 | update_state: dict[str, torch.Tensor] = {} 99 | 100 | # look for finetune parameters matching the current rule 101 | for dst_name, dst_weight in dst_state_dict.items(): 102 | match_ptn = rgx_ptn.fullmatch(dst_name) 103 | if not match_ptn: 104 | continue 105 | if any(fnmatch(dst_name, ptn) for ptn in self.target_excludes): 106 | continue 107 | 108 | groups = iter(match_ptn.groups()) 109 | src_name = "".join(next(groups) if ch == "*" else ch for ch in src_ptn) 110 | 111 | # ensure we only pull parameters that are not excluded 112 | if any(fnmatch(src_name, ptn) for ptn in self.source_excludes): 113 | continue 114 | 115 | # fetch pretrained tensor and check compatibility before scheduling update 116 | src_weight = src_state_dict.get(src_name) 117 | assert src_weight is not None, ( 118 | f"Pretrained key not found: pretrained['{src_name}'] (for finetune['{dst_name}'])." 119 | ) 120 | assert src_weight.shape == dst_weight.shape 121 | 122 | update_state[dst_name] = src_weight 123 | 124 | logger.debug(f"Parameter '{dst_name}' initialized from '{src_name}' in checkpoint.") 125 | 126 | # apply the collected updates for this rule and mark them as assigned 127 | assert update_state, f"No parameters matched rule: '{dst_ptn}' -> '{src_ptn}'." 
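            # strict=False is deliberate: update_state covers only the parameters
            # matched by this rule, not the module's full state dict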
128 |             pl_module.load_state_dict(update_state, strict=False)
129 | 
130 |             for dst_name in update_state:
131 |                 dst_state_dict.pop(dst_name)
132 | 
133 |         # prevent re-loading so weights are only imported once
134 |         self._loaded = True
135 | 
136 |     def on_validation_start(self, trainer: lt.Trainer, pl_module: lt.LightningModule) -> None:  # type: ignore[override]
137 |         """Ensure pretrained weights are loaded before running validation."""
138 |         self.on_fit_start(trainer, pl_module)
139 | 
--------------------------------------------------------------------------------
/aiaccel/torch/h5py/hdf5_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST)
2 | # SPDX-License-Identifier: MIT
3 | 
4 | import numpy.typing as npt
5 | from typing import Any, Generic, TypeVar
6 | 
7 | from abc import ABCMeta, abstractmethod
8 | from functools import reduce
9 | import json
10 | from math import ceil
11 | from pathlib import Path
12 | 
13 | from rich.progress import track
14 | 
15 | import h5py
16 | 
17 | T1 = TypeVar("T1")
18 | T2 = TypeVar("T2")
19 | 
20 | 
21 | class HDF5Writer(Generic[T1, T2], metaclass=ABCMeta):
22 |     """
23 |     Abstract base class for writing data to an HDF5 file.
24 | 
25 |     This class provides methods to write data into HDF5 format, supporting both
26 |     single-process and parallel (MPI-based) writing. Subclasses must implement
27 |     `prepare_globals` and `prepare_group` to define how data is structured.
28 | 
29 |     Typical usage is supposed to be (assuming ``numpy`` is imported as ``np``):
30 | 
31 |     .. code-block:: python
32 | 
33 |         class FooHDF5Writer(HDF5Writer):
34 |             def prepare_globals(self):
35 |                 item_list = list(range(100))
36 | 
37 |                 offset = 10
38 |                 maximum = 50
39 | 
40 |                 return item_list, (offset, maximum)
41 | 
42 |             def prepare_group(self, item, context):
43 |                 offset, maximum = context
44 | 
45 |                 group_name = f"{item:04d}"
46 | 
47 |                 return {group_name: {"data": np.full([10, 10], offset + item).clip(max=maximum)}}
48 | 
49 |         writer = FooHDF5Writer()
50 |         writer.write(Path("test.hdf5"), parallel=False)
51 |     """
52 | 
53 |     h5: h5py.File
54 | 
55 |     def _write(self, filename: Path) -> None:
56 |         """
57 |         Write data to an HDF5 file using a single process.
58 | 
59 |         Args:
60 |             filename (Path): Path to the output HDF5 file.
61 |         """
62 | 
63 |         # prepare globals
64 |         items, context = self.prepare_globals()
65 |         group_list = []
66 | 
67 |         # write into hdf5 file
68 |         with h5py.File(filename, "w") as h5:
69 |             for item in track(items):
70 |                 groups = self.prepare_group(item, context)
71 | 
72 |                 for group_name, datasets in groups.items():
73 |                     g = h5.create_group(group_name)
74 | 
75 |                     for dataset_name, data in datasets.items():
76 |                         ds = g.create_dataset(dataset_name, data.shape, dtype=data.dtype)
77 |                         ds[:] = data
78 | 
79 |                     group_list.append(group_name)
80 | 
81 |         with open(filename.with_suffix(".json"), "w") as f:
82 |             json.dump(group_list, f)
83 | 
84 |     def _write_parallel(self, filename: Path) -> None:
85 |         """
86 |         Write data to an HDF5 file using MPI for parallel processing.
87 | 
88 |         Args:
89 |             filename (Path): Path to the output HDF5 file.
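
        Note:
            Requires ``mpi4py`` and an MPI-enabled build of ``h5py``; the file is
            opened with ``driver="mpio"``.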
90 | """ 91 | 92 | # prepare MPI 93 | from mpi4py.MPI import COMM_WORLD 94 | 95 | comm = COMM_WORLD 96 | 97 | rank = comm.Get_rank() 98 | size = comm.Get_size() 99 | 100 | # prepare globals 101 | if rank == 0: 102 | items, context = self.prepare_globals() 103 | items = list(items) + (ceil(len(items) / size) * size - len(items)) * [None] 104 | 105 | globals_ = items, context 106 | else: 107 | globals_ = None 108 | 109 | items, context = comm.bcast(globals_, root=0) 110 | group_list = [] 111 | 112 | # write into hdf5 file 113 | with h5py.File(filename, "w", driver="mpio", comm=comm) as h5: 114 | track_ = track if rank == 0 else lambda x, **kwargs: x 115 | for item in track_(items[rank::size]): 116 | groups = self.prepare_group(item, context) if item is not None else {} 117 | 118 | groups_info = {} 119 | for group_name, datasets in groups.items(): 120 | groups_info[group_name] = {dset: (data.shape, data.dtype) for dset, data in datasets.items()} 121 | 122 | for group_name, datasets in reduce(dict.__or__, comm.allgather(groups_info)).items(): 123 | g = h5.create_group(group_name) 124 | 125 | for dataset_name, (shape, dtype) in datasets.items(): 126 | g.create_dataset(dataset_name, shape, dtype=dtype) 127 | 128 | group_list.append(group_name) 129 | 130 | for group_name, datasets in groups.items(): 131 | g = h5[group_name] # type: ignore 132 | 133 | for dataset_name, data in datasets.items(): 134 | g[dataset_name][:] = data # type: ignore 135 | 136 | if rank == 0: 137 | with open(filename.with_suffix(".json"), "w") as f: 138 | json.dump(group_list, f) 139 | 140 | def write(self, filename: Path, parallel: bool = False) -> None: 141 | """ 142 | Write data to an HDF5 file, optionally using parallel processing. 143 | 144 | Args: 145 | filename (Path): Path to the output HDF5 file. 146 | parallel (bool, optional): Whether to use parallel writing. Defaults to False. 147 | """ 148 | 149 | if not parallel: 150 | self._write(filename) 151 | else: 152 | self._write_parallel(filename) 153 | 154 | @abstractmethod 155 | def prepare_globals(self) -> tuple[list[T1], T2]: 156 | """ 157 | Prepare the global data required for writing. 158 | 159 | This method must be implemented by subclasses to provide the data items 160 | and any necessary context for processing. 161 | 162 | Returns: 163 | tuple[list[T1], T2]: A tuple containing a list of data items and 164 | context information. 165 | """ 166 | pass 167 | 168 | @abstractmethod 169 | def prepare_group(self, item: T1, context: T2) -> dict[str, dict[str, npt.NDArray[Any]]]: 170 | """ 171 | Prepare groups of datasets for writing to HDF5. 172 | 173 | This method must be implemented by subclasses to define how individual 174 | data items should be structured within the HDF5 file. 175 | 176 | Args: 177 | item (T1): A single data item. 178 | context (T2): Additional context for processing. 179 | 180 | Returns: 181 | dict[str, dict[str, npt.NDArray[Any]]]: A dictionary mapping group names 182 | to dataset dictionaries. 
183 | """ 184 | pass 185 | -------------------------------------------------------------------------------- /examples/hpo/benchmark/experiment_coco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (C) 2025 National Institute of Advanced Industrial Science and Technology (AIST) 3 | # SPDX-License-Identifier: MIT 4 | 5 | from typing import Any 6 | 7 | import argparse 8 | from collections.abc import Callable 9 | from concurrent.futures import ThreadPoolExecutor 10 | import csv 11 | import os 12 | import time 13 | 14 | import pandas as pd 15 | 16 | import cocoex 17 | import optuna 18 | 19 | from aiaccel.hpo.optuna.samplers.nelder_mead_sampler import NelderMeadEmptyError, NelderMeadSampler 20 | 21 | 22 | def _optimize_sequential( 23 | study: optuna.Study, func: Callable[[list[float]], float], search_space: dict[str, tuple[int | float, int | float]] 24 | ) -> float | None: 25 | try: 26 | trial = study.ask() 27 | except NelderMeadEmptyError: 28 | return None 29 | param = [] 30 | for name, distribution in search_space.items(): 31 | param.append(trial.suggest_float(name, *distribution)) 32 | 33 | result = func(param) 34 | time.sleep(0.1) 35 | 36 | frozentrial = study.tell(trial, result) 37 | study._log_completed_trial([result], frozentrial.number, frozentrial.params) 38 | return result 39 | 40 | 41 | def _optimize_sequential_wrapper(args: list[Any]) -> float | None: 42 | return _optimize_sequential(*args) 43 | 44 | 45 | def optimize( 46 | study: optuna.Study, 47 | func: Callable[[list[float]], float], 48 | search_space: dict[str, tuple[int | float, int | float]], 49 | result_csv_name: str, 50 | num_trial: int = 1000, 51 | num_parallel: int = 10, 52 | ) -> None: 53 | csv_array: list[list[str | float]] = [["step", "value"]] 54 | 55 | with ThreadPoolExecutor(max_workers=num_parallel) as executor: 56 | for step in range(int(num_trial / num_parallel)): 57 | results = executor.map( 58 | _optimize_sequential_wrapper, [(study, func, search_space) for _ in range(num_parallel)] 59 | ) 60 | for result in results: 61 | if result is not None: 62 | csv_array.append([step, result]) 63 | 64 | with open(result_csv_name, "w") as f: 65 | writer = csv.writer(f) 66 | writer.writerows(csv_array) 67 | 68 | 69 | def create_optuna_result( 70 | study: optuna.Study, output_folder: str, problem: Any, optuna_seed: int, sampler_name: str 71 | ) -> None: 72 | study_df = study.trials_dataframe() 73 | result_dir = f"{sampler_name}/optuna_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}" 74 | os.makedirs(result_dir, exist_ok=True) 75 | study_df.to_csv(result_dir + f"/result_{problem.id}_{optuna_seed:03}.csv") 76 | 77 | 78 | def experiment_bbob() -> None: 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--func_id") 81 | parser.add_argument("--dim") 82 | parser.add_argument("--instance") 83 | parser.add_argument("--optuna_seed") 84 | parser.add_argument("--sampler_name") 85 | args, _ = parser.parse_known_args() 86 | 87 | func_id = int(args.func_id) 88 | dim = int(args.dim) 89 | instance = int(args.instance) 90 | optuna_seed = int(args.optuna_seed) 91 | sampler_name = args.sampler_name 92 | 93 | ### input 94 | suite_name = "bbob" 95 | output_folder = f"optuna-{sampler_name}-func_id{func_id}-dim{dim}-instance{instance}" 96 | budget_multiplier = 200 # increase to 10, 100, ... 
97 | 
98 |     ### prepare
99 |     suite_options = f"function_indices: {func_id} dimensions: {dim} instance_indices: {instance}"
100 |     print(suite_options)
101 |     suite = cocoex.Suite(suite_name, "", suite_options)
102 |     observer = cocoex.Observer(suite_name, "result_folder: " + output_folder)
103 |     minimal_print = cocoex.utilities.MiniPrint()
104 | 
105 |     num_parallel = 10
106 | 
107 |     ### go
108 |     for problem in suite:  # this loop will take several minutes or longer
109 |         problem.observe_with(observer)  # generates the data for cocopp post-processing
110 | 
111 |         search_space: dict[str, tuple[int | float, int | float]] = {}
112 |         for i in range(problem.dimension):
113 |             search_space[f"x{i}"] = (-5.0, 5.0)
114 |         print(search_space)
115 | 
116 |         if sampler_name == "nelder-mead":
117 |             # Nelder-Mead (no sub-sampler)
118 |             study = optuna.create_study(
119 |                 sampler=NelderMeadSampler(search_space=search_space, seed=optuna_seed, block=False)
120 |             )
121 |         elif sampler_name == "nelder-mead-subTPE":
122 |             # NM+subTPE
123 |             sub_sampler = optuna.samplers.TPESampler(seed=optuna_seed, consider_magic_clip=True, multivariate=False)
124 |             study = optuna.create_study(
125 |                 sampler=NelderMeadSampler(
126 |                     search_space=search_space, seed=optuna_seed, block=False, sub_sampler=sub_sampler
127 |                 )
128 |             )
129 |         elif sampler_name == "TPE":
130 |             # TPE
131 |             study = optuna.create_study(
132 |                 sampler=optuna.samplers.TPESampler(seed=optuna_seed, consider_magic_clip=True, multivariate=False)
133 |             )
134 |         else:
135 |             raise ValueError(f"{sampler_name} is not defined.")
136 | 
137 |         num_trial = budget_multiplier * problem.dimension
138 |         step_csv_dir = f"{sampler_name}/step_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
139 |         os.makedirs(step_csv_dir, exist_ok=True)
140 |         optimize(
141 |             study,
142 |             problem,
143 |             search_space,
144 |             step_csv_dir + f"result_{problem.id}_{optuna_seed:03}.csv",
145 |             num_trial,
146 |             num_parallel,
147 |         )
148 | 
149 |         create_optuna_result(study, output_folder, problem, optuna_seed, sampler_name)
150 | 
151 |         optuna_seed += 1
152 | 
153 |         minimal_print(problem, final=problem.index == len(suite) - 1)
154 | 
155 |     # result - f_opt
156 |     for i, problem in enumerate(suite):
157 |         coco_file_path = (
158 |             "exdata/"
159 |             + f"{output_folder}/"
160 |             + f"data_f{problem.id_function}/bbobexp_f{problem.id_function}_DIM{problem.dimension}.rdat"
161 |         )
162 | 
163 |         with open(coco_file_path) as f:
164 |             data = f.readlines()
165 | 
166 |         f_opt = float(data[i % 15].split(" ")[12][1:-1])  # optimum recorded in the COCO .rdat file
167 |         print(f_opt)
168 | 
169 |         optuna_result_dir = (
170 |             f"{sampler_name}/optuna_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
171 |         )
172 |         optuna_seed = i + 1
173 |         df = pd.read_csv(optuna_result_dir + f"result_{problem.id}_{optuna_seed:03}.csv")
174 |         df["value - f_opt"] = df["value"] - f_opt
175 | 
176 |         print(df)
177 |         df.to_csv(optuna_result_dir + f"result_{problem.id}_{optuna_seed:03}_fopt.csv")
178 | 
179 |         step_csv_dir = f"{sampler_name}/step_csv/{output_folder}/f{problem.id_function}/DM{problem.dimension:02}/"
180 |         df = pd.read_csv(step_csv_dir + f"result_{problem.id}_{optuna_seed:03}.csv")
181 |         df["value - f_opt"] = df["value"] - f_opt
182 | 
183 |         print(df)
184 |         df.to_csv(step_csv_dir + f"result_{problem.id}_{optuna_seed:03}_fopt.csv")
185 | 
186 | 
187 | if __name__ == "__main__":
188 |     experiment_bbob()
189 | 
--------------------------------------------------------------------------------