├── .python-version
├── use-cases
│   ├── virgo
│   │   ├── .gitignore
│   │   ├── requirements.txt
│   │   ├── synthetic-data-gen
│   │   │   └── data_generation_hdf5.sh
│   │   └── slurm_config.yaml
│   ├── cyclones
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── src
│   │   │   ├── strategy.py
│   │   │   └── transform.py
│   │   ├── README.md
│   │   ├── pipeline.yaml
│   │   └── startscript.sh
│   ├── eurac
│   │   ├── .gitignore
│   │   ├── requirements.txt
│   │   ├── slurm_config.yaml
│   │   └── data.py
│   ├── xtclim
│   │   ├── src
│   │   │   ├── .DS_Store
│   │   │   ├── initialization.py
│   │   │   └── utils.py
│   │   ├── outputs
│   │   │   └── .DS_Store
│   │   ├── preprocessing
│   │   │   └── .DS_Store
│   │   ├── requirements.txt
│   │   └── train.py
│   ├── 3dgan
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── create_inference_sample.py
│   │   ├── downsample_h5py_file.py
│   │   ├── run-provenance-experiments.sh
│   │   ├── slurm.jsc.sh
│   │   └── slurm.vega.sh
│   ├── radio-astronomy
│   │   ├── .gitignore
│   │   ├── clean
│   │   ├── requirements.txt
│   │   └── .pytest-clean
│   ├── mnist
│   │   ├── tensorflow
│   │   │   ├── README.md
│   │   │   ├── startscript.sh
│   │   │   ├── pipeline.yaml
│   │   │   └── dataloader.py
│   │   ├── torch
│   │   │   ├── slurm_config.yaml
│   │   │   ├── Dockerfile
│   │   │   ├── create_inference_sample.py
│   │   │   └── saver.py
│   │   └── torch-lightning
│   │       ├── README.md
│   │       └── startscript
│   ├── README.md
│   └── lattice-qcd
│       ├── train.py
│       ├── setup.py
│       └── config.yaml
├── docs
│   ├── requirements.txt
│   ├── tutorials
│   │   ├── workflows
│   │   │   ├── 03-dag-workflows
│   │   │   ├── 01-pipeline-introduction
│   │   │   ├── 02-pipeline-configuration
│   │   │   └── 04_itwinai_argparser.rst
│   │   ├── distrib-ml
│   │   │   ├── torch-tutorial-GAN.rst
│   │   │   ├── torch_tutorial_0_basics.rst
│   │   │   ├── tf_tutorial_1_imagenet.rst
│   │   │   ├── torch_tutorial_1_mnist.rst
│   │   │   ├── tf_tutorial_0_basics.rst
│   │   │   ├── torch_tutorial_kubeflow_1.rst
│   │   │   ├── tf_scaling_test.rst
│   │   │   ├── kuberay-setup-tutorial.rst
│   │   │   ├── torch_scaling_test.rst
│   │   │   └── torch-tutorial-containers.rst
│   │   └── tutorials.rst
│   ├── getting-started
│   │   ├── plugins.rst
│   │   └── plugins-list.rst
│   ├── images
│   │   ├── icon-itwinai-orange.png
│   │   ├── icon-itwinai-white.png
│   │   ├── icon-itwinai-orange-white.png
│   │   ├── icon-itwinai-orange-black-subtitle.png
│   │   ├── icon-itwinai-orange-white-subtitle.png
│   │   ├── icon-itwinai-orange-black-subtitle-small.png
│   │   └── scalability-plots
│   │       ├── mnist
│   │       │   ├── absolute_epoch_time.png
│   │       │   ├── computation_vs_other_plot.png
│   │       │   └── relative_epoch_time_speedup.png
│   │       ├── virgo
│   │       │   ├── absolute_epoch_time.png
│   │       │   ├── computation_vs_other_plot.png
│   │       │   ├── outdated
│   │       │   │   ├── gpu_energy_plot.png
│   │       │   │   ├── utilization_plot.png
│   │       │   │   ├── communication_plot.png
│   │       │   │   ├── absolute_scalability_plot.png
│   │       │   │   └── relative_scalability_plot.png
│   │       │   └── relative_epoch_time_speedup.png
│   │       └── eurac
│   │           └── outdated
│   │               ├── gpu_energy_plot.png
│   │               ├── utilization_plot.png
│   │               ├── communication_plot.png
│   │               ├── absolute_scalability_plot.png
│   │               └── relative_scalability_plot.png
│   ├── how-it-works
│   │   ├── loggers
│   │   │   └── figures
│   │   │       └── logger_fig.png
│   │   ├── workflows
│   │   │   └── figures
│   │   │       ├── comp_Get.png
│   │   │       ├── comp_Adapt.png
│   │   │       ├── comp_Proc.png
│   │   │       ├── comp_Save.png
│   │   │       ├── comp_Split.png
│   │   │       ├── comp_Train.png
│   │   │       ├── Adapt_example.png
│   │   │       ├── comp_Predict.png
│   │   │       ├── simple_pipeline.png
│   │   │       └── Advanced_workflow.png
│   │   └── training
│   │       ├── training.rst
│   │       └── explain_ddp.rst
│   ├── api
│   │   ├── cli.md
│   │   ├── itwinai.loggers.rst
│   │   ├── itwinai.type.rst
│   │   ├── itwinai.utils.rst
│   │   ├── itwinai.parser.rst
│   │   ├── itwinai.distributed.rst
│   │   ├── itwinai.pipeline.rst
│   │   ├── itwinai.components.rst
│   │   ├── itwinai.serialization.rst
│   │   ├── cli_reference.rst
│   │   ├── modules.rst
│   │   ├── itwinai.tf.modules.rst
│   │   ├── itwinai.tests.modules.rst
│   │   ├── itwinai.scalability_report.modules.rst
│   │   └── itwinai.torch.modules.rst
│   ├── _static
│   │   └── custom.css
│   ├── use-cases
│   │   ├── xtclim_doc.rst
│   │   ├── use_cases.rst
│   │   ├── mnist_doc.rst
│   │   ├── latticeqcd_doc.rst
│   │   ├── cyclones_doc.rst
│   │   └── 3dgan_doc.rst
│   ├── Makefile
│   ├── make.bat
│   ├── testing-with-pytest.md
│   └── installation
│       ├── user_installation.rst
│       ├── software_prerequisites.rst
│       └── post_itwinai_installation.rst
├── tutorials
│   ├── ml-workflows
│   │   ├── .gitignore
│   │   ├── 03-dag-workflows
│   │   │   └── Advanced_workflow.png
│   │   ├── 01-pipeline-introduction
│   │   │   ├── sample_pipeline_1.jpg
│   │   │   └── sample_pipeline_2.jpg
│   │   └── 04-itwinai-argparser
│   │       └── README.md
│   ├── distributed-ml
│   │   ├── torch-tutorial-1-mnist
│   │   │   ├── .gitignore
│   │   │   ├── slurm_config.yaml
│   │   │   ├── config.yaml
│   │   │   └── README.md
│   │   ├── torch-tutorial-2-trainer-class
│   │   │   ├── slurm_config.yaml
│   │   │   ├── README.md
│   │   │   ├── sample_srun.sh
│   │   │   └── sample_code.py
│   │   ├── torch-scaling-test
│   │   │   ├── img
│   │   │   │   └── report.png
│   │   │   ├── config
│   │   │   │   ├── ddp.yaml
│   │   │   │   ├── deepspeed.yaml
│   │   │   │   ├── horovod.yaml
│   │   │   │   └── base.yaml
│   │   │   └── slurm_config.yaml
│   │   ├── torch-tutorial-0-basics
│   │   │   └── slurm_config.yaml
│   │   ├── torch-tutorial-GAN
│   │   │   └── slurm_config.yaml
│   │   ├── tf-tutorial-0-basics
│   │   │   ├── README.md
│   │   │   └── tfmirrored_slurm.sh
│   │   ├── tf-tutorial-1-imagenet
│   │   │   ├── README.md
│   │   │   └── tfmirrored_slurm.sh
│   │   ├── torch-tutorial-containers
│   │   │   ├── model.py
│   │   │   ├── run_docker.sh
│   │   │   ├── runall.sh
│   │   │   └── config.yaml
│   │   ├── torch-kubeflow-1
│   │   │   ├── Dockerfile
│   │   │   └── cpu.yaml
│   │   └── tf-scaling-test-jube
│   │       ├── README.md
│   │       └── jube_ddp.sh
│   └── hpo-workflows
│       └── fashion-mnist
│           └── config.yaml
├── .github
│   ├── linters
│   │   ├── .shellcheckrc
│   │   ├── .isort.cfg
│   │   ├── .jscpd.json
│   │   ├── .flake8
│   │   ├── .markdownlint.json
│   │   ├── mlc_config.json
│   │   ├── .hadolint.yaml
│   │   └── .ruff.toml
│   ├── dependabot.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── workflows
│   │   ├── check-links.yml
│   │   ├── sqaaas.yml
│   │   ├── pypi.yml
│   │   └── pytest.yml
│   └── ISSUE_TEMPLATE.md
├── ci
│   ├── .gitattributes
│   ├── .gitignore
│   ├── dagger.json
│   ├── src
│   │   └── main
│   │       ├── literals.py
│   │       ├── utils.py
│   │       └── __init__.py
│   └── pyproject.toml
├── env-files
│   ├── torch
│   │   ├── requirements
│   │   │   ├── requirements.txt
│   │   │   ├── cmcc-requirements.txt
│   │   │   └── README.md
│   │   ├── jupyter
│   │   │   ├── ipython_kernel_config.json
│   │   │   ├── start-cloud.sh
│   │   │   ├── README.md
│   │   │   ├── setup.sh
│   │   │   └── asyncssh_config.py
│   │   ├── horovod-deepspeed-JSC.slurm
│   │   ├── createEnvVega.sh
│   │   ├── README.md
│   │   ├── install-horovod-deepspeed-cuda.sh
│   │   ├── generic_torch.sh
│   │   └── createEnvJSC.sh
│   ├── docs
│   │   ├── build-docs-jsc.sh
│   │   └── create-docs-env-jsc.sh
│   └── tensorflow
│       ├── createEnvVegaTF.sh
│       ├── generic_tf.sh
│       └── createEnvJSCTF.sh
├── .gitmodules
├── setup.cfg
├── COPYRIGHT
├── THIRD_PARTY_LICENSES
├── src
│   └── itwinai
│       ├── plugins
│       │   └── __init__.py
│       ├── torch
│       │   ├── __init__.py
│       │   ├── data
│       │   │   └── __init__.py
│       │   ├── models
│       │   │   └── __init__.py
│       │   ├── type.py
│       │   └── reproducibility.py
│       ├── tensorflow
│       │   ├── __init__.py
│       │   ├── data
│       │   │   └── __init__.py
│       │   ├── models
│       │   │   ├── __init__.py
│       │   │   └── mnist.py
│       │   └── utils.py
│       ├── tests
│       │   ├── exceptions.py
│       │   └── __init__.py
│       ├── type.py
│       ├── slurm
│       │   ├── sample_slurm_config.yaml
│       │   ├── slurm_constants.py
│       │   └── slurm_script_configuration.py
│       └── constants.py
├── CHANGELOG
├── CODEOWNERS
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── tests
│   ├── test_cli.py
│   ├── torch
│   │   └── test_config.py
│   ├── run_on_jsc.sh
│   ├── conftest.py
│   ├── use-cases
│   │   ├── conftest.py
│   │   └── test_cyclones.py
│   └── components
│       └── conftest.py
├── MAINTAINERS.md
├── .readthedocs.yaml
├── .dockerignore
└── CITATION.cff

/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 |
--------------------------------------------------------------------------------
/use-cases/virgo/.gitignore:
--------------------------------------------------------------------------------
1 | data/
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | .[torch,docs]
2 |
--------------------------------------------------------------------------------
/tutorials/ml-workflows/.gitignore:
--------------------------------------------------------------------------------
1 | *.yaml
--------------------------------------------------------------------------------
/use-cases/cyclones/requirements.txt:
--------------------------------------------------------------------------------
1 | gdown
--------------------------------------------------------------------------------
/.github/linters/.shellcheckrc:
--------------------------------------------------------------------------------
1 | disable=SC2148
2 |
--------------------------------------------------------------------------------
/ci/.gitattributes:
--------------------------------------------------------------------------------
1 | /sdk/** linguist-generated
2 |
--------------------------------------------------------------------------------
/use-cases/eurac/.gitignore:
--------------------------------------------------------------------------------
1 | tmp
2 | plots/
3 |
--------------------------------------------------------------------------------
/use-cases/cyclones/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | experiments
--------------------------------------------------------------------------------
/ci/.gitignore:
--------------------------------------------------------------------------------
1 | /sdk
2 | /.venv
3 | /**/__pycache__
4 | /.env
5 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-1-mnist/.gitignore:
--------------------------------------------------------------------------------
1 | MNIST
2 |
--------------------------------------------------------------------------------
/env-files/torch/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # Additional requirements go here
2 |
--------------------------------------------------------------------------------
/.github/linters/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | profile = black
3 | known_first_party = itwinai
--------------------------------------------------------------------------------
/docs/tutorials/workflows/03-dag-workflows:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/03-dag-workflows
--------------------------------------------------------------------------------
/use-cases/virgo/requirements.txt:
--------------------------------------------------------------------------------
1 | gwpy
2 | h5py
3 | pandas
4 | scikit-learn
5 | matplotlib
6 |
--------------------------------------------------------------------------------
/docs/tutorials/workflows/01-pipeline-introduction:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/01-pipeline-introduction/
--------------------------------------------------------------------------------
/docs/tutorials/workflows/02-pipeline-configuration:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/02-pipeline-configuration/
--------------------------------------------------------------------------------
/use-cases/xtclim/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/src/.DS_Store
--------------------------------------------------------------------------------
/docs/getting-started/plugins.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../tutorials/plugins/README.md
2 |    :parser: myst_parser.sphinx_
3 |
--------------------------------------------------------------------------------
/.github/linters/.jscpd.json:
--------------------------------------------------------------------------------
1 | {
2 |   "threshold": 2.0,
3 |   "ignore": [
4 |     "**/itwinai/loggers.py"
5 |   ]
6 | }
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-white.png
--------------------------------------------------------------------------------
/use-cases/xtclim/outputs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/outputs/.DS_Store
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-white.png
--------------------------------------------------------------------------------
/use-cases/xtclim/preprocessing/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/preprocessing/.DS_Store
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-2-trainer-class/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | training_cmd: "train.py"
2 | num_nodes: 2
3 | gpus_per_node: 4
4 |
--------------------------------------------------------------------------------
/use-cases/3dgan/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py>=3.7.0
2 | google>=3.0.0
3 | protobuf>=4.24.3
4 | gdown>=4.7.1
5 | # plotly>=5.18.0
6 | # kaleido>=0.2.1
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tutorials/plugins"]
2 |     path = tutorials/plugins
3 |     url = https://github.com/interTwin-eu/itwinai-plugin-template/
--------------------------------------------------------------------------------
/docs/how-it-works/loggers/figures/logger_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/loggers/figures/logger_fig.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Get.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Get.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Adapt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Adapt.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Proc.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Save.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Split.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Train.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-black-subtitle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-black-subtitle.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-white-subtitle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-white-subtitle.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/Adapt_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/Adapt_example.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Predict.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Predict.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/simple_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/simple_pipeline.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-black-subtitle-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-black-subtitle-small.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/Advanced_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/Advanced_workflow.png
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/img/report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/distributed-ml/torch-scaling-test/img/report.png
--------------------------------------------------------------------------------
/use-cases/eurac/requirements.txt:
--------------------------------------------------------------------------------
1 | hython[complete] @ git+https://github.com/interTwin-eu/hython.git@main
2 | scikit-learn
3 | tqdm
4 | cf_xarray
5 | requests
6 | aiohttp
7 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/absolute_epoch_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/absolute_epoch_time.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/absolute_epoch_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/absolute_epoch_time.png
--------------------------------------------------------------------------------
/tutorials/ml-workflows/03-dag-workflows/Advanced_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/03-dag-workflows/Advanced_workflow.png
--------------------------------------------------------------------------------
/.github/linters/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8
3 | extend-ignore = E203,W503
4 | max-line-length = 95
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/gpu_energy_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/gpu_energy_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/utilization_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/utilization_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/computation_vs_other_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/computation_vs_other_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/computation_vs_other_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/computation_vs_other_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/gpu_energy_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/gpu_energy_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/utilization_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/utilization_plot.png
--------------------------------------------------------------------------------
/env-files/torch/jupyter/ipython_kernel_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "IPKernelApp": {
3 |     "extensions": [
4 |       "rucio_jupyterlab.kernels.ipython"
5 |     ]
6 |   }
7 | }
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/.gitignore:
--------------------------------------------------------------------------------
1 | syn_payload/
2 | syn_param/
3 | models/*
4 | scalability-metrics/
5 | plots/*
6 | outputs/
7 | mllogs/
8 | checkpoints/
9 | __pycache__/
10 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/communication_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/communication_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/relative_epoch_time_speedup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/relative_epoch_time_speedup.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/communication_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/communication_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/relative_epoch_time_speedup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/relative_epoch_time_speedup.png
--------------------------------------------------------------------------------
/env-files/torch/requirements/cmcc-requirements.txt:
--------------------------------------------------------------------------------
1 | cartopy
2 | joblib
3 | lightning
4 | matplotlib
5 | munch
6 | pandas
7 | requests
8 | tqdm
9 | timm
10 | toml
11 | xarray
12 | zarr
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | extend-ignore = E203,W503
3 | max-line-length = 95
4 | exclude = .git,__pycache__,docs/conf.py,use-cases,tutorials,tests,old,build,dist,.venv*,envAI*,env-files,.vscode,ci
--------------------------------------------------------------------------------
/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_1.jpg
--------------------------------------------------------------------------------
/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_2.jpg
--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
1 | This project is licensed under Apache-2.0.
2 |
3 | Copyrights in this project are retained by their contributors.
4 | No copyright assignment is required to contribute to this project.
--------------------------------------------------------------------------------
/docs/api/cli.md:
--------------------------------------------------------------------------------
1 | # itwinai CLI reference placeholder
2 |
3 | Please overwrite this file before building the docs:
4 |
5 | ```bash
6 | typer itwinai.cli utils docs --output docs/api/cli.md
7 | ```
8 |
--------------------------------------------------------------------------------
/env-files/torch/requirements/README.md:
--------------------------------------------------------------------------------
1 | # Additional requirements
2 |
3 | This folder contains additional (optional) Python dependencies, for instance
4 | dependencies specific to interTwin use cases.
5 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* Custom CSS for resizing the Sphinx logo */
2 | .logo img {
3 |   width: 150px; /* Adjust the width as needed */
4 |   height: auto; /* Maintain the aspect ratio */
5 | }
6 |
--------------------------------------------------------------------------------
/docs/api/itwinai.loggers.rst:
--------------------------------------------------------------------------------
1 | itwinai.loggers
2 | ================
3 |
4 | .. automodule:: itwinai.loggers
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/absolute_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/absolute_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/relative_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/relative_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/absolute_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/absolute_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/relative_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/relative_scalability_plot.png
--------------------------------------------------------------------------------
/docs/api/itwinai.type.rst:
--------------------------------------------------------------------------------
1 | itwinai.type
2 | =============
3 |
4 | .. automodule:: itwinai.type
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.utils.rst:
--------------------------------------------------------------------------------
1 | itwinai.utils
2 | =============
3 |
4 | .. automodule:: itwinai.utils
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.parser.rst:
--------------------------------------------------------------------------------
1 | itwinai.parser
2 | ==============
3 |
4 | .. automodule:: itwinai.parser
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 2
3 | updates:
4 |   # Maintain dependencies for GitHub Actions
5 |   - package-ecosystem: "github-actions"
6 |     directory: "/"
7 |     schedule:
8 |       interval: "daily"
9 |
--------------------------------------------------------------------------------
/docs/api/itwinai.distributed.rst:
--------------------------------------------------------------------------------
1 | itwinai.distributed
2 | ===================
3 |
4 | .. automodule:: itwinai.distributed
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
--------------------------------------------------------------------------------
/docs/api/itwinai.pipeline.rst:
--------------------------------------------------------------------------------
1 | itwinai.pipeline
2 | ================
3 |
4 | .. automodule:: itwinai.pipeline
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.components.rst:
--------------------------------------------------------------------------------
1 | itwinai.components
2 | ==================
3 |
4 | .. automodule:: itwinai.components
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/use-cases/xtclim/requirements.txt:
--------------------------------------------------------------------------------
1 | cartopy
2 | cftime
3 | codecarbon
4 | dask
5 | datetime
6 | imageio
7 | ipykernel
8 | matplotlib
9 | numpy
10 | pandas
11 | tqdm
12 | urllib3==1.26.*
13 | xarray
14 | netCDF4
15 | h5netcdf
--------------------------------------------------------------------------------
/docs/api/itwinai.serialization.rst:
--------------------------------------------------------------------------------
1 | itwinai.serialization
2 | =====================
3 |
4 | .. automodule:: itwinai.serialization
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/.github/linters/.markdownlint.json:
--------------------------------------------------------------------------------
1 | {
2 |   "MD013": {
3 |     "line_length": 120,
4 |     "code_blocks": false,
5 |     "tables": false
6 |   },
7 |   "MD014": false,
8 |   "MD024": false,
9 |   "MD026": {
10 |     "punctuation": ".,:;!"
11 |   }
12 | }
13 |
--------------------------------------------------------------------------------
/env-files/torch/jupyter/start-cloud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo "[start.sh] Running setup.sh for Rucio (generates rucio.cfg)..."
5 | /usr/local/bin/setup.sh
6 |
7 | echo "[start.sh] Running original start.sh..."
8 | exec /usr/local/bin/start-original.sh "$@"
9 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/clean:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | rm -rf outputs checkpoints plots/*
4 | rm models/trained_Filter_test_v0.pt
5 | rm models/trained_CNN1D_test_v0.pt
6 | rm models/trained_UNet_test_v0.pt
7 | rm -rf logs
8 | rm -rf mllogs ml-logs mlruns
9 | rm -f progress.out report.out
--------------------------------------------------------------------------------
/use-cases/mnist/tensorflow/README.md:
--------------------------------------------------------------------------------
1 | # TensorFlow example on MNIST dataset
2 |
3 | **Integration author(s)**: Roman Machacek (CERN), Matteo Bunino (CERN)
4 |
5 | ## Training
6 |
7 | ```bash
8 | # Run the whole training pipeline
9 | itwinai exec-pipeline +pipe_key=pipeline
10 | ```
11 |
--------------------------------------------------------------------------------
/THIRD_PARTY_LICENSES:
--------------------------------------------------------------------------------
1 | The file `src/itwinai/flamegraph.pl` is from Brendan Gregg’s Flamegraph project
2 | (https://github.com/brendangregg/Flamegraph) and is licensed under the CDDL v1.0. It was
3 | copied unmodified on 2025-04-22.
4 |
5 | See `licenses/CDDL-1.0.txt` for the full license text.
6 |
--------------------------------------------------------------------------------
/docs/use-cases/xtclim_doc.rst:
--------------------------------------------------------------------------------
1 | ML-based extreme events detection and characterization (xtclim, CERFACS)
2 | ========================================================================
3 |
4 | .. include:: ../../use-cases/xtclim/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 3
7 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/requirements.txt:
--------------------------------------------------------------------------------
1 | numpyencoder>=0.3.0
2 | pulsarrfi-nn @ git+https://gitlab.com/ml-ppa/pulsarrfi_nn.git@version_0.2#subdirectory=unet_semantic_segmentation
3 | pulsardt @ git+https://gitlab.com/ml-ppa/pulsardt@main
4 | ipywidgets
5 | pyqt6>=6.0
6 | pyquaternion>=0.9.9
7 | scikit-image>=0.22.0
8 | tqdm>=4.65.0
--------------------------------------------------------------------------------
/ci/dagger.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "itwinai",
3 |   "engineVersion": "v0.18.12",
4 |   "sdk": {
5 |     "source": "python"
6 |   },
7 |   "dependencies": [
8 |     {
9 |       "name": "k3s",
10 |       "source": "github.com/marcosnils/daggerverse/k3s@k3s/v0.1.10",
11 |       "pin": "28eea1fcf3b6ecb38a628186107760acd717442f"
12 |     }
13 |   ]
14 | }
15 |
--------------------------------------------------------------------------------
/src/itwinai/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to
7 | [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
8 |
9 | ## [Unreleased]
10 |
11 | ## [X.X.XX]
12 | - Change description (#PR_NUMBER) (AUTHOR)
13 |
--------------------------------------------------------------------------------
/docs/api/cli_reference.rst:
--------------------------------------------------------------------------------
1 | CLI
2 | ===
3 |
4 | Here you can find the itwinai CLI reference.
5 |
6 | .. cli_reference.md must be generated by typer using:
7 | .. $ typer itwinai.cli utils docs --output docs/api/cli.md
8 | .. More info: https://typer.tiangolo.com/tutorial/package/#generate-docs
9 |
10 | .. include:: cli.md
11 |    :parser: myst_parser.sphinx_
12 |    :start-line: 2
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/data/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/models/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/data/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/models/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/.pytest-clean:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ### THIS IS A CLEAN-UP SCRIPT FOR THE TEST SUITE ###
4 | ### PLEASE DO NOT EDIT THIS FILE UNLESS WORKING ###
5 | ### ON THE TEST SUITE FOR THE RADIO-ASTRONOMY USE-CASE ###
6 |
7 | rm -rf outputs checkpoints plots/*
8 | rm -rf logs
9 | rm -rf mllogs ml-logs mlruns
10 | rm -f progress.out report.out
11 | rm -rf .test_dataset
--------------------------------------------------------------------------------
/docs/api/modules.rst:
--------------------------------------------------------------------------------
1 | Python SDK
2 | ==========
3 |
4 | .. toctree::
5 |    :maxdepth: 4
6 |
7 |    itwinai.components
8 |    itwinai.distributed
9 |    itwinai.loggers
10 |    itwinai.parser
11 |    itwinai.pipeline
12 |    itwinai.scalability_report.modules
13 |    itwinai.serialization
14 |    itwinai.tests.modules
15 |    itwinai.tf.modules
16 |    itwinai.torch.modules
17 |    itwinai.type
18 |    itwinai.utils
19 |
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/torch-tutorial-GAN.rst:
--------------------------------------------------------------------------------
1 | GAN tutorial with PyTorch
2 | =========================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-GAN/README.md
5 |    :parser: myst_parser.sphinx_
6 |
7 |
8 | Python files
9 | ------------------
10 |
11 | train.py
12 | ++++++++++++
13 |
14 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-GAN/train.py
15 |    :language: python
16 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/ddp.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | backend: nccl
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/deepspeed.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | backend: nccl
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-0-basics/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | time: 00:20:00
3 | partition: develbooster
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py -s {dist_strat}"
17 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-GAN/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | partition: develbooster
3 | time: 00:20:00
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py --strategy {dist_strat}"
17 |
--------------------------------------------------------------------------------
/use-cases/mnist/torch/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | num_nodes: 1
2 | gpus_per_node: 4
3 | python_venv: ../../../.venv/
4 | account: s24r05-03-users
5 | partition: gpu
6 |
7 | dist_strat: ddp
8 | pipe_key: training_pipeline
9 |
10 | py_spy: false
11 | profiling_sampling_rate: 100
12 |
13 | training_cmd: "$(which itwinai) exec-pipeline \
14 |   strategy={dist_strat} \
15 |   checkpoints_location=checkpoints_{dist_strat} \
16 |   +pipe_key={pipe_key}"
17 |
--------------------------------------------------------------------------------
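The `training_cmd` in the slurm_config.yaml files above uses Python-style `{placeholder}` fields (e.g. `{dist_strat}`, `{pipe_key}`) that are filled in from the other config keys before the SLURM script is generated. A minimal sketch of that expansion, assuming simple `str.format`-style substitution (the actual interpolation happens inside itwinai's SLURM script builder):

```python
# Illustrative only: shows the {placeholder} convention used by training_cmd.
# The real substitution is performed by itwinai's SLURM script builder.
config = {
    "dist_strat": "ddp",
    "pipe_key": "training_pipeline",
}
training_cmd = (
    "$(which itwinai) exec-pipeline "
    "strategy={dist_strat} "
    "checkpoints_location=checkpoints_{dist_strat} "
    "+pipe_key={pipe_key}"
)
# str.format replaces each {key} with the matching config value
print(training_cmd.format(**config))
# -> $(which itwinai) exec-pipeline strategy=ddp
#    checkpoints_location=checkpoints_ddp +pipe_key=training_pipeline
```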
/docs/tutorials/distrib-ml/torch_tutorial_0_basics.rst:
--------------------------------------------------------------------------------
1 | Introduction to distributed ML with PyTorch
2 | ===============================================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-0-basics/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 2
7 |
8 | train.py
9 | ++++++++
10 |
11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-0-basics/train.py
12 |    :language: python
13 |
--------------------------------------------------------------------------------
/docs/tutorials/workflows/04_itwinai_argparser.rst:
--------------------------------------------------------------------------------
1 | Integrating configuration with command line arguments
2 | =========================================================
3 |
4 |
5 | .. include:: ../../../tutorials/ml-workflows/04-itwinai-argparser/README.md
6 |    :parser: myst_parser.sphinx_
7 |
8 |
9 | main.py
10 | ---------
11 |
12 | .. literalinclude:: ../../../tutorials/ml-workflows/04-itwinai-argparser/main.py
13 |    :language: python
14 |
15 |
16 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | partition: develbooster
3 | time: 00:20:00
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py -s {dist_strat} -c config.yaml"
17 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | <!--
8 | -->
9 |
10 | # Summary
11 |
12 |
13 | ---
14 |
15 |
16 |
17 | **Related issue :**
18 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/horovod.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | fp16_allreduce: False
11 | use_adasum: False
12 | gradient_predivide_factor: 1.0
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners
2 | # https://github.blog/2017-07-06-introducing-code-owners/
3 |
4 | # Assign code owners that will automatically get asked to review Pull Requests
5 | # The last matching pattern takes the most precedence.
6 |
7 | # These owners will be the default owners for everything in the repo.
8 | # Unless a later match takes precedence, they will be requested for
9 | # review when someone opens a pull request.
10 |
11 | * @matbun
--------------------------------------------------------------------------------
/use-cases/xtclim/src/initialization.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | # Mean-Squared Error as the average difference between the pixels
5 | # in the original image vs. the reconstructed one
6 | criterion = nn.MSELoss()
7 | # pixel-wise MSE loss
8 | pixel_wise_criterion = nn.MSELoss(reduction='none')
9 |
10 | # KL divergence handles dispersion of information in latent space
11 | # a balance is to be found with the prevailing reconstruction error
12 | beta = 0.1
13 |
14 | # number of evaluations for each dataset
15 | n_avg = 20
--------------------------------------------------------------------------------
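The constants in `initialization.py` above (the MSE `criterion` and the KL weight `beta`) describe a VAE-style objective: reconstruction error balanced against a beta-weighted KL divergence. A minimal sketch of how such a loss is typically assembled, assuming a standard diagonal-Gaussian latent; the `vae_loss` helper below is illustrative and not code from this repository:

```python
import torch
import torch.nn as nn

criterion = nn.MSELoss()
beta = 0.1  # weight of the KL term, as in initialization.py


def vae_loss(recon_x, x, mu, log_var):
    """Reconstruction (MSE) plus beta-weighted KL divergence.

    Assumes a diagonal-Gaussian latent parameterized by (mu, log_var),
    using the usual closed-form KL against N(0, I).
    """
    recon = criterion(recon_x, x)
    # KL(N(mu, sigma^2) || N(0, 1)), summed over latent dims, averaged over batch
    kl = -0.5 * torch.mean(
        torch.sum(1 + log_var - mu.pow(2) - log_var.exp(), dim=1)
    )
    return recon + beta * kl
```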
/ci/src/main/literals.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | import dagger
11 | from dagger import enum_type
12 |
13 |
14 | @enum_type
15 | class MLFramework(dagger.Enum):
16 |     TORCH = "TORCH"
17 |     TENSORFLOW = "TENSORFLOW"
--------------------------------------------------------------------------------
/src/itwinai/tests/exceptions.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """Custom exceptions raised during sanity checks for itwinai."""
11 |
12 |
13 | class SanityCheckError(Exception):
14 |     """Base exception for all sanity check errors."""
--------------------------------------------------------------------------------
/tutorials/ml-workflows/04-itwinai-argparser/README.md:
--------------------------------------------------------------------------------
1 | # itwinai ArgumentParser
2 |
3 | **Author(s)**: Matteo Bunino
4 |
5 | itwinai provides a wrapper of jsonargparse's ArgumentParser which supports
6 | configuration files by default.
7 |
8 | To run as usual:
9 |
10 | ```bash
11 | python main.py -d 20 --train-prop 0.7 --val-prop 0.2 --lr 1e-5
12 | ```
13 |
14 | To reuse the parameters saved in a configuration file and override some
15 | parameter (e.g., learning rate):
16 |
17 | ```bash
18 | python main.py --config advanced_tutorial_conf.yaml --lr 2e-3
19 | ```
20 |
--------------------------------------------------------------------------------
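For context, the underlying `jsonargparse` library already supports config files via `ActionConfigFile`; itwinai's wrapper enables this by default. A minimal sketch of the plain-`jsonargparse` equivalent, with argument names mirroring the tutorial's CLI flags (illustrative, not the tutorial's actual `main.py`):

```python
from jsonargparse import ActionConfigFile, ArgumentParser

parser = ArgumentParser()
# --config lets users load arguments from a YAML file
parser.add_argument("--config", action=ActionConfigFile)
parser.add_argument("-d", "--data-size", type=int, default=20)
parser.add_argument("--train-prop", type=float, default=0.7)
parser.add_argument("--val-prop", type=float, default=0.2)
parser.add_argument("--lr", type=float, default=1e-5)

# CLI flags given after --config override values loaded from the file
args = parser.parse_args()
print(args.lr)
```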
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "ms-python.flake8",
4 |     "streetsidesoftware.code-spell-checker",
5 |     "njpwerner.autodocstring",
6 |     "dlyz.md-link-checker",
7 |     "davidanson.vscode-markdownlint",
8 |     "ms-python.vscode-pylance",
9 |     "ms-python.python",
10 |     "bierner.markdown-mermaid",
11 |     "tamasfe.even-better-toml",
12 |     "charliermarsh.ruff",
13 |     "github.vscode-github-actions",
14 |     "dnut.rewrap-revived",
15 |     "emilast.logfilehighlighter"
16 |   ]
17 | }
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/tf_tutorial_1_imagenet.rst:
--------------------------------------------------------------------------------
1 | TensorFlow ImageNet example
2 | ===========================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/README.md
5 |    :parser: myst_parser.sphinx_
6 |
7 | train.py
8 | ++++++++
9 |
10 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py
11 |    :language: python
12 |
13 |
14 | tfmirrored_slurm.sh
15 | +++++++++++++++++++
16 |
17 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/tfmirrored_slurm.sh
18 |    :language: bash
19 |
20 |
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/torch_tutorial_1_mnist.rst:
--------------------------------------------------------------------------------
1 | Distributed training on MNIST dataset
2 | ==========================================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 2
7 |
8 | train.py
9 | ++++++++
10 |
11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
12 |    :language: python
13 |
14 | config.yaml
15 | +++++++++++
16 |
17 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml
18 |    :language: yaml
19 |
--------------------------------------------------------------------------------
/use-cases/mnist/torch-lightning/README.md:
--------------------------------------------------------------------------------
1 | # Torch Lightning example on MNIST dataset
2 |
3 | **Integration author(s)**: Matteo Bunino (CERN)
4 |
5 | ## Training
6 |
7 | ```bash
8 | # Download dataset and exit: only run first step in the pipeline (index=0)
9 | itwinai exec-pipeline +pipe_key=training_pipeline +pipe_steps=[0]
10 |
11 | # Run the whole training pipeline
12 | itwinai exec-pipeline +pipe_key=training_pipeline
13 | ```
14 |
15 | View training logs on MLFlow server (if activated from the configuration):
16 |
17 | ```bash
18 | mlflow ui --backend-store-uri mllogs/mlflow/
19 | ```
20 |
--------------------------------------------------------------------------------
/use-cases/cyclones/src/strategy.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | # gets the mirrored strategy based on whether or not we are running the model
5 | # with CPU or GPU
6 | def get_mirrored_strategy(cores=4):
7 |     if cores:
8 |         CPUs = ['CPU:'+str(i) for i in range(cores)]
9 |         mirrored_strategy = tf.distribute.MirroredStrategy(CPUs)
10 |     else:
11 |         mirrored_strategy = tf.distribute.MirroredStrategy()
12 |
13 |     print('Number of devices: {}'.format(
14 |         mirrored_strategy.num_replicas_in_sync))
15 |
16 |     return mirrored_strategy, mirrored_strategy.num_replicas_in_sync
--------------------------------------------------------------------------------
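A model built inside the returned strategy's scope has its variables mirrored across the selected devices. A minimal usage sketch of `get_mirrored_strategy`; the Keras model below is an illustrative placeholder, not the cyclones use case's actual network:

```python
import tensorflow as tf

from strategy import get_mirrored_strategy  # assumes strategy.py is importable

# Variables created inside strategy.scope() are replicated on every device
strategy, n_replicas = get_mirrored_strategy(cores=4)
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1),
    ])
    model.compile(optimizer="adam", loss="mse")

# Common practice: scale the global batch size with the number of replicas
global_batch_size = 32 * n_replicas
```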
include:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | train.py 8 | ++++++++ 9 | 10 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/train.py 11 | :language: python 12 | 13 | 14 | tfmirrored_slurm.sh 15 | +++++++++++++++++++ 16 | 17 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/tfmirrored_slurm.sh 18 | :language: bash 19 | 20 | -------------------------------------------------------------------------------- /env-files/torch/horovod-deepspeed-JSC.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Installation for JSC 4 | 5 | # Job configuration 6 | #SBATCH --job-name=setup_venv 7 | #SBATCH --account=intertwin 8 | #SBATCH --output=horovod_ds_installation.out 9 | #SBATCH --error=horovod_ds_installation.err 10 | #SBATCH --time=00:30:00 11 | 12 | # Resource allocation 13 | #SBATCH --partition=develbooster 14 | #SBATCH --nodes=1 15 | #SBATCH --gres=gpu 16 | 17 | ml --force purge 18 | ml Stages/2025 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 19 | ml Python CMake HDF5 PnetCDF libaio mpi4py git 20 | 21 | source .venv/bin/activate 22 | source env-files/torch/install-horovod-deepspeed-cuda.sh 23 | -------------------------------------------------------------------------------- /docs/api/itwinai.tf.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.tensorflow 2 | ================== 3 | 4 | distributed.py 5 | ++++++++++++++ 6 | 7 | .. automodule:: itwinai.tensorflow.distributed 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | trainer.py 15 | +++++++++++ 16 | 17 | .. automodule:: itwinai.tensorflow.trainer 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | :member-order: bysource 22 | 23 | 24 | utils.py 25 | ++++++++ 26 | 27 | .. automodule:: itwinai.tensorflow.utils 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | :member-order: bysource 32 | 33 | -------------------------------------------------------------------------------- /docs/api/itwinai.tests.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.tests 2 | ============= 3 | 4 | 5 | dummy_components.py 6 | +++++++++++++++++++ 7 | 8 | .. automodule:: itwinai.tests.dummy_components 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :member-order: bysource 13 | 14 | 15 | exceptions.py 16 | +++++++++++++ 17 | 18 | .. automodule:: itwinai.tests.exceptions 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | :member-order: bysource 23 | 24 | 25 | sanity_check.py 26 | +++++++++++++++ 27 | 28 | .. automodule:: itwinai.tests.sanity_check 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | :member-order: bysource 33 | 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/Dockerfile: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Find more base image candidates under: 11 | # - https://github.com/interTwin-eu/itwinai/pkgs/container/itwinai 12 | # - https://github.com/interTwin-eu/itwinai/pkgs/container/itwinai-dev 13 | FROM ghcr.io/intertwin-eu/itwinai:torch-skinny-latest 14 | 15 | # Add torch MNIST use case 16 | COPY use-cases/mnist/torch/* ./ 17 | -------------------------------------------------------------------------------- /src/itwinai/type.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Framework-independent types.""" 11 | 12 | 13 | class MLArtifact: 14 | """A framework-independent machine learning artifact.""" 15 | 16 | 17 | class MLDataset(MLArtifact): 18 | """A framework-independent machine learning dataset.""" 19 | 20 | 21 | class MLModel(MLArtifact): 22 | """A framework-independent machine learning model.""" 23 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-0-basics/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for TensorFlow 2 | 3 | In this tutorial we show how to use TensorFlow `MultiWorkerMirroredStrategy`. 4 | Note that the environment is tested on the HDFML system at JSC. 5 | For other systems, the module versions might need to be changed accordingly. 6 | Other strategies will be added here. 7 | 8 | First, from the root of this repository, build the environment containing 9 | TensorFlow. You can *try* with: 10 | 11 | ```bash 12 | # Creates a Python venv called envAItf_hdfml 13 | make tf-gpu-jsc 14 | ``` 15 | 16 | If you want to distribute the code in `train.py`, run from terminal: 17 | 18 | ```bash 19 | sbatch tfmirrored_slurm.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-1-imagenet/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for TensorFlow 2 | 3 | In this tutorial we show how to use TensorFlow `MultiWorkerMirroredStrategy`. 4 | Note that the environment is tested on the HDFML system at JSC. 5 | For other systems, the module versions might need to be changed accordingly. 6 | Other strategies will be added here. 7 | 8 | First, from the root of this repository, build the environment containing 9 | TensorFlow.
You can *try* with: 10 | 11 | ```bash 12 | # Creates a Python venv called envAItf_hdfml 13 | make tf-gpu-jsc 14 | ``` 15 | 16 | If you want to distribute the code in `train.py`, run from terminal: 17 | 18 | ```bash 19 | sbatch tfmirrored_slurm.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -W -v 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch_tutorial_kubeflow_1.rst: -------------------------------------------------------------------------------- 1 | Tutorial on Kubeflow and TorchTrainer class 2 | =========================================== 3 | 4 | .. include:: ../../../tutorials/distributed-ml/torch-kubeflow-1/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | train-cpu.py 9 | ++++++++++++ 10 | 11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py 12 | :language: python 13 | 14 | 15 | cpu.yaml 16 | ++++++++ 17 | 18 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml 19 | :language: yaml 20 | 21 | Dockerfile 22 | ++++++++++ 23 | 24 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/Dockerfile 25 | :language: dockerfile 26 | -------------------------------------------------------------------------------- /.github/linters/.hadolint.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | failure-threshold: warning 11 | ignored: 12 | - DL3008 # Pin versions in apt get install. 13 | - DL3013 # Pin versions in pip. TODO: remove. 
14 | - DL4001 # Either use Wget or Curl but not both 15 | - DL3003 # Use WORKDIR to switch to a directory 16 | - DL3006 # Always tag the version of an image explicitly: https://github.com/hadolint/hadolint/issues/339 -------------------------------------------------------------------------------- /env-files/docs/build-docs-jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # - Anna Lappe - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | # Build the documentation locally and serve it on localhost on JSC systems 14 | 15 | ml --force purge 16 | ml Stages/2023 GCCcore/.11.3.0 Python/3.10.4 Pandoc/2.19.2 17 | 18 | source .venv-docs/bin/activate 19 | cd docs 20 | make clean && make html && python -m http.server -d _build/html -------------------------------------------------------------------------------- /docs/use-cases/use_cases.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Each use case comes with its own tutorial on how to run it. Before running them, 5 | however, you should set up a Python virtual environment. 6 | 7 | After installing and activating the virtual environment, you will want to install the 8 | use-case specific dependencies, if applicable. This can be done by first ``cd``-ing 9 | into the use-case directory and then installing the requirements, as follows: 10 | 11 | .. code-block:: bash 12 | 13 | cd use-cases/ 14 | pip install -r requirements.txt 15 | 16 | 17 | Alternatively, you can use the use-case Docker image, if available. After setting 18 | everything up, you can now run the use case as specified in the use case's tutorial. 19 | -------------------------------------------------------------------------------- /env-files/docs/create-docs-env-jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Create .venv-docs virtualenv to build the documentation locally on JSC systems 13 | 14 | ml --force purge 15 | ml Stages/2023 GCCcore/.11.3.0 Python/3.10.4 Pandoc/2.19.2 16 | 17 | cmake --version 18 | gcc --version 19 | 20 | rm -rf .venv-docs 21 | python -m venv .venv-docs 22 | source .venv-docs/bin/activate 23 | 24 | pip install -r docs/requirements.txt -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/tf_scaling_test.rst: -------------------------------------------------------------------------------- 1 | TensorFlow scaling test 2 | ======================= 3 | 4 | .. include:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | train.py 9 | ++++++++ 10 | 11 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/train.py 12 | :language: python 13 | 14 | 15 | jube_ddp.sh 16 | +++++++++++ 17 | 18 | ..
literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/jube_ddp.sh 19 | :language: bash 20 | 21 | 22 | .. TODO: improve notebook rendering 23 | 24 | .. bench_plot.ipynb 25 | .. ++++++++++++++++ 26 | 27 | .. .. literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/bench_plot.ipynb 28 | .. :language: python 29 | -------------------------------------------------------------------------------- /docs/use-cases/mnist_doc.rst: -------------------------------------------------------------------------------- 1 | MNIST dataset 2 | ============= 3 | 4 | This section covers the MNIST use case. This use case has been implemented using three 5 | different frameworks: ``TensorFlow``, ``PyTorch``, and ``PyTorch Lightning``. You can 6 | find the files relevant to this use case 7 | in the `use case's folder on Github `_. 8 | 9 | For more information on each implementation, consult their respective READMEs: 10 | 11 | Torch Lightning 12 | --------------- 13 | 14 | .. include:: ../../use-cases/mnist/torch-lightning/README.md 15 | :parser: myst_parser.sphinx_ 16 | :start-line: 2 17 | 18 | 19 | PyTorch 20 | ------- 21 | 22 | .. include:: ../../use-cases/mnist/torch/README.md 23 | :parser: myst_parser.sphinx_ 24 | :start-line: 2 25 | -------------------------------------------------------------------------------- /docs/use-cases/latticeqcd_doc.rst: -------------------------------------------------------------------------------- 1 | Normalizing flow for generating lattice field configurations (Lattice QCD, ETHZ/CSIC) 2 | ===================================================================================== 3 | 4 | The code is adapted from `this notebook `_ from the Lattice QCD use case. 5 | 6 | More information on the use case is available in the published deliverables, 7 | `D4.2 `_, 8 | `D7.2 `_ and `D7.4 `_. 9 | 10 | 11 | About the use-case and integration 12 | ---------------------------------- 13 | ..
include:: ../../use-cases/lattice-qcd/README.md 14 | :parser: myst_parser.sphinx_ 15 | :start-after: 16 | :end-before: 17 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from itwinai.cli import generate_slurm 4 | from itwinai.slurm.utils import get_slurm_job_parser 5 | 6 | 7 | def test_cli_slurm_function_signature(): 8 | """Test that function signature in cli.py matches argparser""" 9 | args = inspect.getfullargspec(generate_slurm).args 10 | parser = get_slurm_job_parser() 11 | 12 | ignored_args = ["print_config", "help"] 13 | parser_args = {arg.dest for arg in parser._actions} 14 | parser_args -= set(ignored_args) 15 | 16 | missing_in_function = parser_args - set(args) 17 | missing_in_parser = set(args) - parser_args 18 | 19 | assert not missing_in_function and not missing_in_parser, ( 20 | f"Arguments missing in function: {missing_in_function}, " 21 | f"Arguments missing in parser: {missing_in_parser}" 22 | ) 23 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | ## Maintainers 4 | 5 | - Matteo Bunino - CERN - matteo.bunino\cern.ch 6 | - Jarl Sondre Saether - CERN - jarl.sondre.saether\cern.ch 7 | - Linus Eickhoff - CERN - linus.maximilian.eickhoff\cern.ch 8 | - Anna Elisa Lappe - CERN - anna.elisa.lappe\cern.ch 9 | - Rakesh Sarma - FZJ - r.sarma\fz-juelich.de 10 | 11 | ## Contributors 12 | 13 | - Kalliopi Tsolaki - CERN - kalliopi.tsolaki\cern.ch 14 | - Killian Verder - CERN - killian.verder\cern.ch 15 | - Henry Mutegeki - CERN - henry.mutegeki\cern.ch 16 | - Roman Machacek - CERN - roman.machacek\cern.ch 17 | - Alexander Zoechbauer - CERN - alexander.zoechbauer\cern.ch 18 | - Mario Ruettgers - FZJ - m.ruettgers\fz-juelich.de 19 | 20 | [Full contributors list](https://github.com/interTwin-eu/itwinai/graphs/contributors) 21 | -------------------------------------------------------------------------------- /docs/api/itwinai.scalability_report.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.scalability_report 2 | ========================== 3 | 4 | 5 | data.py 6 | +++++++ 7 | .. automodule:: itwinai.scalability_report.data 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | plot.py 15 | +++++++ 16 | .. automodule:: itwinai.scalability_report.plot 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | :member-order: bysource 21 | 22 | 23 | reports.py 24 | ++++++++++ 25 | .. automodule:: itwinai.scalability_report.reports 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | :member-order: bysource 30 | 31 | 32 | utils.py 33 | ++++++++ 34 | .. automodule:: itwinai.scalability_report.utils 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | :member-order: bysource 39 | -------------------------------------------------------------------------------- /docs/use-cases/cyclones_doc.rst: -------------------------------------------------------------------------------- 1 | Tropical Cyclones Detection (CMCC) 2 | ================================== 3 | 4 | The code is adapted from the CMCC use case's 5 | `repository `_ and refers 6 | to a TensorFlow implementation.
7 | To learn more about the interTwin tropical cyclones detection use case and its DT, please 8 | see the published deliverables, `D4.1 `_, 9 | `D7.1 `_ and 10 | `D7.3 `_. 11 | You can find the relevant code in the 12 | `use case's folder on Github `_, 13 | or by consulting the use case's README: 14 | 15 | .. include:: ../../use-cases/cyclones/README.md 16 | :parser: myst_parser.sphinx_ 17 | :start-line: 2 18 | -------------------------------------------------------------------------------- /src/itwinai/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | from .dummy_components import ( 11 | FakeGetter, 12 | FakeGetterExec, 13 | FakePreproc, 14 | FakePreprocExec, 15 | FakeSaver, 16 | FakeSaverExec, 17 | FakeSplitter, 18 | FakeSplitterExec, 19 | FakeTrainer, 20 | FakeTrainerExec, 21 | ) 22 | 23 | _ = ( 24 | FakeGetter, 25 | FakeGetterExec, 26 | FakePreproc, 27 | FakePreprocExec, 28 | FakeSaver, 29 | FakeSaverExec, 30 | FakeSplitter, 31 | FakeSplitterExec, 32 | FakeTrainer, 33 | FakeTrainerExec, 34 | ) 35 | -------------------------------------------------------------------------------- /src/itwinai/torch/type.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Custom types definition.""" 11 | 12 | from typing import Callable 13 | 14 | import torch 15 | 16 | #: Torch data batch sampled by a ``DataLoader``. 17 | Batch = torch.Tensor 18 | 19 | #: Torch metric function provided by ``torchmetrics`` library.
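#: For example, an instance such as ``torchmetrics.Accuracy(task="multiclass", num_classes=10)``
#: fits this alias, as does any plain ``Callable`` returning a score (illustrative
#: examples, not an exhaustive specification).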
20 | Metric = Callable 21 | 22 | 23 | class UninitializedStrategyError(Exception): 24 | """Error raised when a strategy has not been initialized.""" 25 | 26 | 27 | class DistributedStrategyError(Exception): 28 | """Error raised when a strategy has already been initialized.""" 29 | -------------------------------------------------------------------------------- /use-cases/virgo/synthetic-data-gen/data_generation_hdf5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --account=intertwin 4 | #SBATCH --output=array-job/job_%a.out 5 | #SBATCH --error=array-job/job_%a.err 6 | #SBATCH --time=00:07:00 7 | #SBATCH --mem-per-cpu=1G 8 | #SBATCH --partition=develbooster 9 | #SBATCH --array=1-75 10 | #SBATCH --job-name=generate_virgo_data 11 | #SBATCH --cpus-per-task=26 12 | 13 | # Load required modules 14 | ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py 15 | 16 | # Activate Python virtual environment 17 | source ../../envAI_juwels/bin/activate 18 | 19 | # Folder in which the datasets will be stored 20 | target_file="/p/scratch/intertwin/datasets/virgo_hdf5/virgo_data_${SLURM_ARRAY_TASK_ID}.hdf5" 21 | 22 | python synthetic-data-gen/file_gen_hdf5.py \ 23 | --num-datapoints 10000 \ 24 | --num-processes 25 \ 25 | --save-frequency 1000 \ 26 | --save-location "$target_file" 27 | 28 | -------------------------------------------------------------------------------- /use-cases/3dgan/Dockerfile: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | FROM nvcr.io/nvidia/pytorch:23.09-py3 11 | 12 | WORKDIR /usr/src/app 13 | 14 | # Install itwinai 15 | COPY pyproject.toml ./ 16 | COPY src ./ 17 | RUN pip install --upgrade pip \ 18 | && pip install --no-cache-dir lightning \ 19 | && pip install --no-cache-dir . 20 | 21 | # Add 3DGAN use case files and install additional requirements 22 | COPY use-cases/3dgan/requirements.txt ./ 23 | COPY use-cases/3dgan/* ./ 24 | RUN pip install --no-cache-dir -r requirements.txt 25 | 26 | # ENTRYPOINT [ "itwinai", "exec-pipeline" ] 27 | # CMD [ "--config", "pipeline.yaml" ] -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /use-cases/xtclim/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train file to launch pipeline 3 | """ 4 | 5 | import os 6 | import sys 7 | from itwinai.parser import ConfigParser 8 | from itwinai.utils import load_yaml 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), "src")) 11 | sys.path.append(os.path.join(os.path.dirname(__file__), "preprocessing")) 12 | 13 | 14 | if __name__ == "__main__": 15 | 16 | config = load_yaml('pipeline.yaml') 17 | seasons_list = config['seasons'] 18 | 19 | for season in seasons_list: 20 | model_uri = f"outputs/cvae_model_{season}1d_1memb.pth" 21 | override_dict = { 22 | 'season': season, 23 | 'model_uri': model_uri 24 | } 25 | pipe_parser = ConfigParser( 26 | config=config, 27 | override_keys=override_dict 28 | ) 29 | pipeline = pipe_parser.parse_pipeline() 30 | 31 | print(f"Running pipeline for season: {season}") 32 | pipeline.execute() -------------------------------------------------------------------------------- /env-files/tensorflow/createEnvVegaTF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------------------------------------------------- 5 | # Part of the interTwin Project: https://www.intertwin.eu/ 6 | # 7 | # Created by: Matteo Bunino 8 | # 9 | # Credit: 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ ! -f "env-files/tensorflow/generic_tf.sh" ]; then 14 | echo "ERROR: env-files/tensorflow/generic_tf.sh not found!" 15 | exit 1 16 | fi 17 | 18 | # Load modules 19 | # NOTE: REFLECT THEM IN THE MAIN README! 
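# The modules below provide the toolchain used to build the TF venv on Vega:
# compiler and CMake, MPI (OpenMPI + mpi4py), CUDA 11.7 with NCCL and cuDNN,
# and the Python interpreter that generic_tf.sh uses to create the environment.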
20 | ml --force purge 21 | ml Python 22 | ml CMake/3.24.3-GCCcore-11.3.0 23 | ml mpi4py 24 | ml OpenMPI 25 | ml CUDA/11.7 26 | ml GCCcore/11.3.0 27 | ml NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0 28 | ml cuDNN 29 | 30 | 31 | # Create and install torch env 32 | export ENV_NAME=".venv-tf" 33 | bash env-files/tensorflow/generic_tf.sh -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Data and logging 11 | data_dir: ./ 12 | log_int: 10 13 | verbose: True 14 | restart_int: 10 15 | download_only: False 16 | dataset_replication: 10 17 | shuff: False 18 | nworker: 4 # num workers dataloader 19 | prefetch: 2 20 | 21 | # Model 22 | batch_size: 64 23 | epochs: 2 24 | lr: 0.001 25 | momentum: 0.5 26 | 27 | # Reproducibility 28 | rnd_seed: 10 29 | 30 | # Distributed ML 31 | backend: nccl # ignored when using Horovod 32 | 33 | # Horovod: ignored when NOT using Horovod 34 | fp16_allreduce: False 35 | use_adasum: False 36 | gradient_predivide_factor: 1.0 37 | 38 | -------------------------------------------------------------------------------- /use-cases/mnist/torch-lightning/startscript: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=PrototypeTest 5 | #SBATCH --account=intertwin 6 | #SBATCH -o logs_slurm/job-2.out 7 | #SBATCH -e logs_slurm/job-2.err 8 | #SBATCH --time=00:30:00 9 | 10 | # configure node and process count on the CM 11 | #SBATCH --partition=develbooster 12 | #SBATCH --nodes=1 13 | #SBATCH --ntasks-per-node=4 14 | #SBATCH --cpus-per-task=4 15 | #SBATCH --gpus-per-node=4 16 | 17 | #SBATCH --exclusive 18 | 19 | # gres options have to be disabled for deepv 20 | #SBATCH --gres=gpu:4 21 | 22 | # load modules 23 | ml --force purge 24 | ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN 25 | ml Python CMake HDF5 PnetCDF libaio 26 | 27 | # activate environment 28 | source ../../../envAI_juwels/bin/activate 29 | 30 | # ON LOGIN NODE download datasets: 31 | # ../../../.venv-pytorch/bin/itwinai exec-pipeline +pipe_key=training_pipeline +pipe_steps=[dataloading_step] 32 | 33 | srun itwinai exec-pipeline +pipe_steps=[1] -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Check links 3 | 4 | on: 5 | push: 6 | pull_request: 7 | 8 | jobs: 9 | markdown-link-check: 10 | name: Check links using markdown-link-check 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | # Checks out a copy of your repository on the ubuntu-latest machine 15 | - name: Checkout code 16 | uses: actions/checkout@v6 17 | with: 18 | # Make sure the actual branch is checked out when running on PR 19 | # ref: ${{ github.event.pull_request.head.sha }} 20 | # Full git history needed to get proper list of changed files 21 | fetch-depth: 0 22 | 23 | - name: Check links on new changes 24 | uses: gaurav-nelson/github-action-markdown-link-check@v1 25 | with: 26 | config-file: 
".github/linters/mlc_config.json" 27 | check-modified-files-only: "yes" 28 | use-quiet-mode: "yes" 29 | use-verbose-mode: "yes" 30 | base-branch: "main" 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/README.md: -------------------------------------------------------------------------------- 1 | # JupyterLab image for itwinai with Rucio client 2 | 3 | The files in this folder are adapted from the work done by 4 | the [VRE team](https://github.com/vre-hub/environments). 5 | 6 | To build this container, go into the root of itwinai and run 7 | 8 | ```bash 9 | docker build -t : -f env-files/torch/jupyter/Dockerfile . 10 | ``` 11 | 12 | using your preferred `` and ``. 13 | 14 | ## Install custom dependencies 15 | 16 | To install custom dependencies (e.g., use cases packages) you can add them 17 | in a `requirements.txt` file, add it somewhere **in the itwinai directory** and pass 18 | it to the `docker build`: 19 | 20 | ```bash 21 | docker build -t : -f env-files/torch/jupyter/Dockerfile \ 22 | --build-arg REQUIREMENTS=path/to/requirements.txt . 23 | ``` 24 | 25 | For instance: 26 | 27 | ```bash 28 | docker build -t : -f env-files/torch/jupyter/Dockerfile \ 29 | --build-arg REQUIREMENTS=env-files/torch/requirements/cmcc-requirements.txt . 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/kuberay-setup-tutorial.rst: -------------------------------------------------------------------------------- 1 | Distributed Machine Learning on HPC from k8s using KubeRay operator and interLink 2 | ================================================================================= 3 | 4 | .. include:: ../../../tutorials/distributed-ml/kuberay-setup-tutorial/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | raycluster_example.yaml 9 | +++++++++++++++++++++++ 10 | 11 | This file defines the RayCluster, the file is referenced in the tutorial as the values file 12 | used by the KubeRay operator to deploy Ray 13 | clusters on Kubernetes. 14 | It specifies the configuration for head and worker nodes, including resource requests, 15 | environment variables, and startup commands. 
16 | For a full reference of supported fields and structure, see the 17 | `Ray on Kubernetes config documentation `_ 18 | 19 | 20 | .. literalinclude:: ../../../tutorials/distributed-ml/kuberay-setup-tutorial/raycluster_example.yaml 21 | :language: yaml 22 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch_scaling_test.rst: -------------------------------------------------------------------------------- 1 | PyTorch scaling test 2 | ==================== 3 | 4 | .. include:: ../../../tutorials/distributed-ml/torch-scaling-test/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | Plots of the scalability metrics 9 | -------------------------------- 10 | 11 | We have the following scalability metrics available: 12 | 13 | - Absolute wall-clock time comparison 14 | - Relative wall-clock time speedup 15 | - Computation vs. Other time 16 | - Communication vs. Computation time (deprecated) 17 | - GPU Utilization (%) 18 | - Power Consumption (Watt) 19 | 20 | You can see example plots of these in the 21 | :doc:`Virgo documentation <../../use-cases/virgo_doc>` or the 22 | :doc:`EURAC documentation <../../use-cases/eurac_doc>`. 23 | 24 | Additionally, we ran a larger scalability test with this tutorial on the full ImageNet 25 | dataset with the older script. This only shows the relative speedup and can be seen here: 26 | 27 | .. image:: ../../../tutorials/distributed-ml/torch-scaling-test/img/report.png 28 | 29 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-scaling-test/config/base.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Data and logging 11 | data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/ 12 | epoch_time_directory: scalability-metrics/epoch-time 13 | 14 | # Subset size can be an int or None. Cannot be larger than the length of the dataset. 
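# For instance, "subset_size: 500" would train on just 500 samples (an illustrative
# value, not a recommendation).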
15 | # If you wish to set it to "None", you must use "null" as that is what yaml expects 16 | subset_size: 5000 17 | log_int: 10 18 | 19 | # verbose: True 20 | nworker: 4 # num workers dataloader 21 | prefetch: 2 22 | 23 | # Model 24 | batch_size: 64 # micro batch size 25 | epochs: 10 26 | lr: 0.001 27 | momentum: 0.5 28 | shuff: False 29 | 30 | # Reproducibility 31 | rnd_seed: 10 32 | -------------------------------------------------------------------------------- /use-cases/3dgan/create_inference_sample.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Create a simple inference dataset sample and a checkpoint.""" 11 | 12 | import argparse 13 | import os 14 | 15 | import torch 16 | from model import ThreeDGAN 17 | 18 | 19 | def create_checkpoint(root: str = ".", ckpt_name: str = "3dgan-inference.pth"): 20 | ckpt_path = os.path.join(root, ckpt_name) 21 | net = ThreeDGAN() 22 | torch.save(net, ckpt_path) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--root", type=str, default=".") 28 | parser.add_argument("--ckpt-name", type=str, default="3dgan-inference.pth") 29 | args = parser.parse_args() 30 | create_checkpoint(**vars(args)) 31 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-2-trainer-class/sample_srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Jarl Sondre Sæther 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # -------------------------------------------------------------------------------------- 11 | # This file contains the sample bash code that was used in the interTwin presentation 12 | # held on Feb. 18. It is meant to illustrate how to combine srun and torchrun to launch 13 | # processes in parallel that can communicate and thus facilitate distributed ML. 14 | 15 | srun --cpu-bind=none --ntasks-per-node=1 \ 16 | bash -c "torchrun \ 17 | --nnodes=2 \ 18 | --nproc_per_node=4 \ 19 | --rdzv_id=151152 \ 20 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ 23 | python train.py" 24 | -------------------------------------------------------------------------------- /env-files/torch/createEnvVega.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------------------------------------------------- 5 | # Part of the interTwin Project: https://www.intertwin.eu/ 6 | # 7 | # Created by: Matteo Bunino 8 | # 9 | # Credit: 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ ! -f "env-files/torch/generic_torch.sh" ]; then 14 | echo "ERROR: env-files/torch/generic_torch.sh not found!" 
15 | exit 1 16 | fi 17 | 18 | # Load modules 19 | # NOTE: REFLECT THEM IN THE MAIN README! 20 | ml --force purge 21 | ml CMake/3.29.3-GCCcore-13.3.0 22 | ml mpi4py/3.1.5 23 | ml OpenMPI/4.1.6-GCC-13.2.0 24 | ml cuDNN/8.9.7.29-CUDA-12.3.0 25 | ml CUDA/12.6.0 26 | ml NCCL/2.22.3-GCCcore-13.3.0-CUDA-12.6.0 27 | ml Python/3.12.3-GCCcore-13.3.0 28 | 29 | # You should have CUDA 12.6 now 30 | 31 | 32 | # Create and install torch env 33 | export ENV_NAME=".venv-pytorch" 34 | export PIP_INDEX_TORCH_CUDA="https://download.pytorch.org/whl/cu126" 35 | bash env-files/torch/generic_torch.sh 36 | -------------------------------------------------------------------------------- /src/itwinai/tensorflow/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | import json 12 | 13 | import keras 14 | 15 | 16 | def model_to_json(model: keras.Model, filepath: str): 17 | """Serialize Keras model to JSON file. 18 | 19 | Args: 20 | model (keras.Model): Keras model. 21 | filepath (str): JSON file path. 22 | """ 23 | with open(filepath, "w") as f: 24 | json.dump(model.to_json(), f) 25 | 26 | 27 | def model_from_json(filepath: str) -> keras.Model: 28 | """Deserialize Keras model from JSON file. 29 | 30 | Args: 31 | filepath (str): JSON file path. 32 | 33 | Returns: 34 | keras.Model: loaded Keras model. 35 | """ 36 | with open(filepath, "r") as f: 37 | config = json.load(f) 38 | return keras.models.model_from_json(config) 39 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python /opt/setup-rucio-jupyterlab/configure.py 4 | 5 | # Creation of the rucio.cfg file 6 | mkdir -p /certs /tmp; 7 | echo -n $RUCIO_ACCESS_TOKEN > /tmp/rucio_oauth.token; 8 | # mkdir -p /opt/rucio/etc; 9 | # echo "[client]" >> /opt/rucio/etc/rucio.cfg; 10 | # echo "rucio_host = https://rucio-intertwin-testbed.desy.de" >> /opt/rucio/etc/rucio.cfg; 11 | # echo "auth_host = https://rucio-intertwin-testbed-auth.desy.de" >> /opt/rucio/etc/rucio.cfg; 12 | # #echo "ca_cert = /certs/rucio_ca.pem" >> /opt/rucio/etc/rucio.cfg; 13 | # echo "ca_cert = /opt/conda/lib/python3.9/site-packages/certifi/cacert.pem" >> /opt/rucio/etc/rucio.cfg; 14 | # echo "account = $JUPYTERHUB_USER" >> /opt/rucio/etc/rucio.cfg; 15 | # echo "auth_type = oidc" >> /opt/rucio/etc/rucio.cfg; 16 | # echo "oidc_audience = rucio-testbed" >> /opt/rucio/etc/rucio.cfg; 17 | # echo "oidc_polling = true" >> /opt/rucio/etc/rucio.cfg; 18 | # echo "oidc_scope = openid profile offline_access eduperson_entitlement" >> /opt/rucio/etc/rucio.cfg; 19 | # echo "auth_token_file_path = /tmp/rucio_oauth.token" >> /opt/rucio/etc/rucio.cfg; 20 | 21 | exec "$@" -------------------------------------------------------------------------------- /ci/src/main/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | 
# - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | def get_codename(release_info: str) -> str: 12 | """ 13 | Extracts the codename (VERSION_CODENAME or os_version) from release information. 14 | 15 | Args: 16 | release_info (str): The string containing the output of /etc/*-release. 17 | 18 | Returns: 19 | str: The extracted codename (e.g., "jammy" or "bookworm"). 20 | """ 21 | # Create a dictionary from the release info 22 | release_dict = {} 23 | for line in release_info.splitlines(): 24 | if "=" in line: 25 | key, value = line.split("=", 1) 26 | release_dict[key.strip()] = value.strip().strip('"') 27 | 28 | # Attempt to extract the codename 29 | return release_dict.get("VERSION_CODENAME", release_dict.get("os_version", "Unknown")) 30 | -------------------------------------------------------------------------------- /use-cases/3dgan/downsample_h5py_file.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Downsample H5 files to a more manageable size.""" 11 | 12 | import h5py 13 | 14 | IN_FILENAME = "large_file.h5" 15 | OUT_FILENAME = "sample.h5" 16 | MAXITEMS = 100 17 | 18 | with h5py.File(IN_FILENAME, "r") as input_file: 19 | with h5py.File(OUT_FILENAME, "w") as outfile: 20 | for key in input_file.keys(): 21 | print(input_file[key]) 22 | shape = list(input_file[key].shape) 23 | shape[0] = MAXITEMS 24 | outfile.create_dataset_like(name=key, other=input_file[key], shape=tuple(shape)) 25 | print(outfile[key]) 26 | outfile[key][...] 
= input_file[key][:MAXITEMS] 27 | 28 | print("verify similarities") 29 | print(input_file["energy"][:10]) 30 | print(outfile["energy"][:10]) 31 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/model.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | import torch.nn.functional as F 12 | from torch import nn 13 | 14 | 15 | class Net(nn.Module): 16 | def __init__(self): 17 | super(Net, self).__init__() 18 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 19 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 20 | self.conv2_drop = nn.Dropout2d() 21 | self.fc1 = nn.Linear(320, 50) 22 | self.fc2 = nn.Linear(50, 10) 23 | 24 | def forward(self, x): 25 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 26 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 27 | x = x.view(-1, 320) 28 | x = F.relu(self.fc1(x)) 29 | x = F.dropout(x, training=self.training) 30 | x = self.fc2(x) 31 | return F.log_softmax(x, dim=1)  # normalize over the class dimension; dim=0 would normalize across the batch 32 | -------------------------------------------------------------------------------- /src/itwinai/slurm/sample_slurm_config.yaml: -------------------------------------------------------------------------------- 1 | job_name: my_slurm_job 2 | 3 | account: intertwin 4 | partition: develbooster 5 | 6 | # HH:MM:SS 7 | time: 00:11:11 8 | 9 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 10 | # if you override the dist_strat in the CLI, then these will already have evaluated 11 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 12 | # the config and avoid overriding it in the CLI. 13 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 14 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 15 | 16 | num_nodes: 1 17 | gpus_per_node: 4 18 | cpus_per_task: 16 19 | memory: 16G 20 | 21 | # The distributed strategy can be "ddp", "deepspeed" or "horovod" 22 | dist_strat: ddp 23 | python_venv: .venv 24 | exp_name: my_experiment 25 | run_name: my_run 26 | exclusive: False 27 | 28 | # Make sure the below strategy matches the one above 29 | training_cmd: | 30 | $(which itwinai) exec-pipeline \ 31 | --config config.yaml \ 32 | --pipe-key rnn_training_pipeline \ 33 | strategy={dist_strat} \ 34 | experiment_name={experiment_name} \ 35 | run_name={run_name} 36 | -------------------------------------------------------------------------------- /use-cases/README.md: -------------------------------------------------------------------------------- 1 | # interTwin use cases integrated into itwinai 2 | 3 | This folder shows how `itwinai` can be used to support scientific use cases. Each use case folder contains: 4 | 5 | - A YAML configuration file describing the ML workflows for that use case. 6 | - A SLURM job script, used to execute the ML workflows on a SLURM-based cluster. 7 | - `requirements.txt`: (optional) use case-specific requirements, which can be installed with: 8 | 9 | ```bash 10 | cd use/case/folder 11 | # After activating the correct environment...
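# For example (illustrative, assuming the default torch venv at the repo root):
# source ../../.venv-pytorch/bin/activate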
12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## How to run a use case 16 | 17 | First, create the use case's Python environment (i.e., PyTorch or TensorFlow) 18 | as described [in the main README](../README.md#environment-setup), and activate it. 19 | Then, install use case-specific dependencies, if any: 20 | 21 | ```bash 22 | pip install -r /use/case/path/requirements.txt 23 | ``` 24 | 25 | Alternatively, you can use the use case Docker image, if available. 26 | 27 | Then, go to the use case's directory: 28 | 29 | ```bash 30 | cd use/case/path 31 | ``` 32 | 33 | From there you can run the use case following the instructions provided in the use case's folder. 34 | -------------------------------------------------------------------------------- /tests/torch/test_config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import pytest 11 | from pydantic import ValidationError 12 | 13 | from itwinai.torch.config import TrainingConfiguration 14 | 15 | 16 | def test_values_parsing(): 17 | """Check dynamic override and creation of new entries.""" 18 | cfg = TrainingConfiguration(batch_size="11", param_abc="11", param_xyz=1.1) 19 | assert cfg.batch_size == 11 20 | assert cfg.param_abc == "11" 21 | assert cfg.param_xyz == 1.1 22 | assert isinstance(cfg.pin_gpu_memory, bool) 23 | 24 | # Check dict-like getitem 25 | assert cfg["batch_size"] == 11 26 | 27 | 28 | def test_illegal_override(): 29 | """Test that illegal type override fails.""" 30 | with pytest.raises(ValidationError) as exc_info: 31 | TrainingConfiguration(batch_size="hello") 32 | assert "batch_size" in str(exc_info.value) 33 | -------------------------------------------------------------------------------- /src/itwinai/slurm/slurm_constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Jarl Sondre Sæther 5 | # 6 | # Credit: 7 | # - Jarl Sondre Sæther - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | DEFAULT_SLURM_LOG_DIR = "slurm-job-logs" 12 | DEFAULT_SLURM_SAVE_DIR = "slurm-scripts" 13 | DEFAULT_PY_SPY_DIR = "py-spy-output" 14 | SLURM_TEMPLATE = r"""#!/bin/bash 15 | 16 | # Job configuration 17 | #SBATCH --job-name={job_name} 18 | #SBATCH --account={account} 19 | #SBATCH --partition={partition} 20 | #SBATCH --time={time} 21 | 22 | #SBATCH --output={std_out} 23 | #SBATCH --error={err_out} 24 | 25 | # Resource allocation 26 | #SBATCH --nodes={num_nodes} 27 | #SBATCH --ntasks-per-node={num_tasks_per_node} 28 | #SBATCH --cpus-per-task={cpus_per_task} 29 | #SBATCH --gpus-per-node={gpus_per_node} 30 | #SBATCH --gres=gpu:{gpus_per_node} 31 | #SBATCH --mem={memory} 32 | {exclusive_line} 33 | 34 | {pre_exec_command} 35 | 36 | {exec_command}""" 37 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/run_docker.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | CMD="itwinai exec-pipeline" 13 | 14 | # Run command in the itwinai torch Docker container 15 | if [ -z "$1" ]; then 16 | # CPU only execution 17 | docker run -it --rm --name mnist-training --user $UID:$GID \ 18 | --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 19 | -v "$PWD":/use-case ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 \ 20 | /bin/bash -c "cd /use-case && $CMD" 21 | elif [ "$1" == "gpu" ]; then 22 | # With GPU support: --gpus all 23 | docker run -it --rm --name mnist-training --user $UID:$GID \ 24 | --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 25 | -v "$PWD":/use-case ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 \ 26 | /bin/bash -c "cd /use-case && $CMD" 27 | fi 28 | -------------------------------------------------------------------------------- /tests/run_on_jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Run tests on JSC environment 13 | # Set TORCH_ENV and TF_ENV variables below to use different 14 | # virtual environment names. 15 | 16 | ml --force purge 17 | ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 18 | ml Python CMake HDF5 PnetCDF libaio 19 | 20 | export TORCH_ENV="envAI_hdfml" 21 | export TF_ENV="envAItf_hdfml" 22 | 23 | if [ ! -d "$TORCH_ENV" ]; then 24 | echo "$TORCH_ENV not found!" 25 | exit 1 26 | fi 27 | if [ ! -d "$TF_ENV" ]; then 28 | echo "$TF_ENV not found!" 29 | exit 1 30 | fi 31 | 32 | # Avoid downloading datasets from Gdrive 33 | export CERN_DATASET="/p/project1/intertwin/smalldata/3dgan-sample" 34 | export CMCCC_DATASET="/p/project1/intertwin/smalldata/cmcc" 35 | export MNIST_DATASET="/p/project1/intertwin/smalldata/mnist" 36 | 37 | $TORCH_ENV/bin/pytest -v tests/ -m "not slurm" -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import pytest 14 | 15 | 16 | @pytest.fixture 17 | def torch_env() -> str: 18 | """If TORCH_ENV env variable is defined, it overrides the default 19 | torch virtual environment name. Otherwise, fall back 20 | to './.venv-pytorch'. 21 | 22 | Returns absolute path to torch virtual environment. 
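For example (hypothetical path): exporting TORCH_ENV='/opt/my-venv' before
running pytest makes this fixture return the resolved absolute path of
'/opt/my-venv'.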
23 | """ 24 | env_path = Path(os.environ.get("TORCH_ENV", "./.venv-pytorch")) 25 | return str(env_path.resolve()) 26 | 27 | 28 | @pytest.fixture 29 | def tf_env() -> str: 30 | """If TF_ENV env variable is defined, it overrides the default 31 | torch virtual environment name. Otherwise, fall back 32 | to './.venv-tf'. 33 | 34 | Returns absolute path to torch virtual environment. 35 | """ 36 | env_path = Path(os.environ.get("TF_ENV", "./.venv-tf")) 37 | return str(env_path.resolve()) 38 | -------------------------------------------------------------------------------- /ci/pyproject.toml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | [project] 11 | name = "main" 12 | version = "0.1.0" 13 | maintainers = [{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" }] 14 | authors = [{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" }] 15 | requires-python = ">=3.12" 16 | dependencies = [ 17 | "dagger-io", 18 | "pyyaml>=6.0.2", 19 | "ruff>=0.7.3", 20 | ] 21 | 22 | [tool.uv.sources] 23 | dagger-io = { path = "sdk", editable = true } 24 | 25 | [build-system] 26 | requires = ["hatchling==1.25.0"] 27 | build-backend = "hatchling.build" 28 | 29 | # Ruff configuration: https://docs.astral.sh/ruff/configuration/ 30 | [tool.ruff] 31 | line-length = 95 32 | 33 | [tool.ruff.lint] 34 | select = ["E", "F", "I", "W"] 35 | ignore = ["E203"] 36 | fixable = ["ALL"] 37 | 38 | [tool.ruff.format] 39 | quote-style = "double" 40 | indent-style = "space" 41 | skip-magic-trailing-comma = false 42 | line-ending = "auto" 43 | -------------------------------------------------------------------------------- /.github/workflows/sqaaas.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright contributors to the Software Quality Assurance as a Service (SQAaaS) project. 
2 | # 3 | # SPDX-License-Identifier: GPL-3.0-only 4 | --- 5 | name: SQAaaS 6 | 7 | on: 8 | push: 9 | branches: [main] 10 | # pull_request: 11 | # branches: [main, dev] 12 | 13 | jobs: 14 | sqaaas_job: 15 | runs-on: ubuntu-latest 16 | name: Job that triggers SQAaaS platform 17 | steps: 18 | - name: Step definition for validating the workflow 19 | uses: eosc-synergy/sqaaas-step-action@v1 20 | with: 21 | name: workflow_validation_step 22 | tool: commands 23 | 24 | # Skipping tensorflow tests: make tensorflow-env-cpu 25 | commands: | 26 | make torch-env-cpu 27 | .venv-pytorch/bin/pytest -v ./tests/ --disable-warnings -n logical --dist loadfile -m "not hpc and not memory_heavy and not tensorflow" 28 | container: eoscsynergy/sqaaas-micromamba:1.5.3-1-rc.8 29 | - name: Print out payload 30 | run: cat workflow_validation_step.json 31 | - name: SQAaaS assessment with unit testing (QC.Uni) step 32 | uses: eosc-synergy/sqaaas-assessment-action@v2 33 | with: 34 | qc_uni_steps: workflow_validation_step 35 | -------------------------------------------------------------------------------- /use-cases/cyclones/src/transform.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def coo_rot180(data): 5 | X, y = data 6 | patch_size = X.shape[0] 7 | X = tf.image.rot90(X, k=2) 8 | y1 = [-1., -1.] 9 | if y[0] != -1: 10 | y1 = [-y[0] + patch_size - 1, -y[1] + patch_size - 1] 11 | return (X, y1) 12 | 13 | 14 | def coo_left_right(data): 15 | X, y = data 16 | patch_size = X.shape[0] 17 | X = tf.image.flip_left_right(X) 18 | y1 = [-1., -1.] 19 | if y[0] != -1: 20 | y1 = [y[0], - y[1] + patch_size - 1] 21 | return (X, y1) 22 | 23 | 24 | def coo_up_down(data): 25 | X, y = data 26 | patch_size = X.shape[0] 27 | X = tf.image.flip_up_down(X) 28 | y1 = [-1., -1.] 
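# A valid cyclone label (y != [-1, -1]) must be remapped as well: an up-down flip
# mirrors the row coordinate and leaves the column unchanged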
29 | if y[0] != -1: 30 | y1 = [- y[0] + patch_size - 1, y[1]] 31 | return (X, y1) 32 | 33 | 34 | def msk_rot180(data): 35 | X, Y = data 36 | X = tf.image.rot90(X, k=2) 37 | Y = tf.image.rot90(Y, k=2) 38 | return (X, Y) 39 | 40 | 41 | def msk_left_right(data): 42 | X, Y = data 43 | X = tf.image.flip_left_right(X) 44 | Y = tf.image.flip_left_right(Y) 45 | return (X, Y) 46 | 47 | 48 | def msk_up_down(data): 49 | X, Y = data 50 | X = tf.image.flip_up_down(X) 51 | Y = tf.image.flip_up_down(Y) 52 | return (X, Y) 53 | -------------------------------------------------------------------------------- /use-cases/xtclim/src/utils.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import numpy as np 3 | import torchvision.transforms as transforms 4 | import matplotlib.pyplot as plt 5 | from torchvision.utils import save_image 6 | 7 | to_pil_image = transforms.ToPILImage() 8 | 9 | def image_to_vid(images): 10 | # save the images generated along the training and build a GIF video from them 11 | imgs = [np.array(to_pil_image(img)) for img in images] 12 | imageio.mimsave('outputs/generated_images.gif', imgs) 13 | 14 | def save_reconstructed_images(recon_images, epoch, season=''): 15 | # save all reconstructed images at each epoch 16 | save_image(recon_images.cpu(), f"outputs/image_record/{season}output{epoch}.jpg") 17 | 18 | def save_ex(recon_ex, epoch, season=''): 19 | # save an example image at a given epoch 20 | save_image(recon_ex.cpu(), f"outputs/image_record/{season}ex{epoch}.jpg") 21 | 22 | def save_loss_plot(train_loss, valid_loss, season=''): 23 | # save the plot of the evolution of both the training and validation losses 24 | plt.figure(figsize=(10, 7)) 25 | plt.plot(train_loss, color='orange', label='train loss') 26 | plt.plot(valid_loss, color='red', label='validation loss') 27 | plt.xlabel('Epochs') 28 | plt.ylabel('Loss') 29 | plt.legend() 30 | plt.savefig(f'outputs/{season}loss.jpg') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /use-cases/lattice-qcd/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from normflow import Model, Fitter 3 | from normflow.nn import DistConvertor_ 4 | from normflow.action import ScalarPhi4Action 5 | from normflow.prior import NormalPrior 6 | 7 | def make_model(): 8 | net_ = DistConvertor_(10, symmetric=True) 9 | prior = NormalPrior(shape=(1,)) 10 | action = ScalarPhi4Action(kappa=0, m_sq=-1.2, lambd=0.5) 11 | 12 | return Model(net_=net_, prior=prior, action=action) 13 | 14 | def fit_func(model, n_epochs=100, strategy='ddp'): 15 | """Training function to fit model.""" 16 | 17 | config = { 18 | "optim_lr": 0.001, 19 | "weight_decay": 0.01, 20 | "ckpt_disp": False, 21 | "batch_size": 128, 22 | "save_every": "None", 23 | "optimizer_class": "torch.optim.AdamW", 24 | "scheduler": "None", 25 | "loss_fn": "None", 26 | "print_stride": 10, 27 | "print_batch_size": 1024, 28 | "snapshot_path": None, 29 | "epochs_run": 0 30 | } 31 | # Initialize the Fitter and execute the training 32 | fitter = Fitter(model=model, epochs=n_epochs, config=config, strategy=strategy) 33 | fitter.execute() 34 | 35 | def main(): 36 | model = make_model() 37 | fit_func(model) 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-kubeflow-1/Dockerfile: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | FROM python:3.11-slim-bullseye 11 | 12 | WORKDIR /app 13 | 14 | RUN apt-get update && apt-get install -y \ 15 | git \ 16 | && apt-get clean -y && rm -rf /var/lib/apt/lists/* 17 | 18 | COPY pyproject.toml pyproject.toml 19 | COPY src src 20 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel \ 21 | && pip install --no-cache-dir ".[torch]" --extra-index-url https://download.pytorch.org/whl/cpu 22 | 23 | COPY tutorials/distributed-ml/torch-k8s/train-cpu.py train-cpu.py 24 | 25 | LABEL org.opencontainers.image.authors="Matteo Bunino - matteo.bunino@cern.ch" 26 | LABEL org.opencontainers.image.url="https://github.com/interTwin-eu/itwinai" 27 | LABEL org.opencontainers.image.documentation="https://itwinai.readthedocs.io/" 28 | LABEL org.opencontainers.image.source="https://github.com/interTwin-eu/itwinai" 29 | LABEL org.opencontainers.image.vendor="CERN - European Organization for Nuclear Research" -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/startscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=PrototypeTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:30:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=4 17 | #SBATCH --gpus-per-node=4 18 | 19 | #SBATCH --exclusive 20 | 21 | # gres options have to be disabled for deepv 22 | #SBATCH --gres=gpu:4 23 | 24 | # load modules 25 | ml --force purge 26 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 27 | 28 | # shellcheck source=/dev/null 29 | source ~/.bashrc 30 | 31 | # Using legacy (2.16) version of Keras 32 | # Latest version with TF (2.16) installs Keras 3.3 33 | # which returns an error for multi-node execution 34 | export TF_USE_LEGACY_KERAS=1 35 | 36 | # ON LOGIN NODE download datasets: 37 | # ../../../.venv-tf/bin/itwinai exec-pipeline --config_name pipeline +pipe_key=pipeline +pipe_steps=[0] 38 | source ../../../envAItf_hdfml/bin/activate 39 | srun itwinai exec-pipeline --config-name pipeline +pipe_key=pipeline verbose=2 40 | -------------------------------------------------------------------------------- /docs/how-it-works/training/training.rst: -------------------------------------------------------------------------------- 1 | Training a neural network 2 | =========================== 3 | 4 | **Author(s)**: Matteo Bunino (CERN) 5 | 6 | itwinai aims at simplifying the way you train deep learning models, helping you to scale training to HPC resources, 7 | while integrating popular logging frameworks, such as MLflow, Weights&Biases, and TensorBoard. 8 | 9 | itwinai TorchTrainer 10 | ------------------------- 11 | 12 | Below, you can find some tutorials that will help you get familiar with the itwinai **TorchTrainer**: 13 | 14 | .. raw:: html 15 | 16 | 17 | 18 | | 19 | 20 | .. 
raw:: html 21 | 22 | 23 | 24 | | 25 | 26 | .. include:: explain_ddp.rst 27 | -------------------------------------------------------------------------------- /env-files/tensorflow/generic_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ -z "$ENV_NAME" ]; then 14 | ENV_NAME=".venv-tf" 15 | fi 16 | 17 | work_dir=$PWD 18 | 19 | # Create the python venv if it doesn't already exist 20 | if [ -d "${work_dir}/$ENV_NAME" ];then 21 | echo "env $ENV_NAME already exists" 22 | else 23 | python3 -m venv $ENV_NAME 24 | echo "$ENV_NAME environment is created in ${work_dir}" 25 | fi 26 | 27 | source $ENV_NAME/bin/activate 28 | 29 | if [ -z "$NO_CUDA" ]; then 30 | TF_EXTRA="tf-cuda" # NO_CUDA is unset: install with CUDA (GPU) support 31 | else 32 | TF_EXTRA="tf" # NO_CUDA is set: CPU-only installation 33 | fi 34 | pip install --no-cache-dir -e ".[$TF_EXTRA,dev]" 35 | 36 | # Install Prov4ML 37 | if [[ "$(uname)" == "Darwin" ]]; then 38 | pip install --no-cache-dir "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2" 39 | else 40 | # Assuming Nvidia GPUs are available 41 | pip install --no-cache-dir "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2" 42 | fi -------------------------------------------------------------------------------- /use-cases/eurac/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this in the runall or scaling-test mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | 6 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 7 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 8 | 9 | account: intertwin 10 | time: 02:00:00 11 | partition: develbooster 12 | 13 | std_out: slurm_job_logs/${dist_strat}.out 14 | err_out: slurm_job_logs/${dist_strat}.err 15 | job_name: eurac-${dist_strat}-job 16 | 17 | num_nodes: 2 18 | num_tasks_per_node: 1 19 | gpus_per_node: 4 20 | cpus_per_task: 16 21 | 22 | python_venv: ../../.venv 23 | pipe_key: training_pipeline 24 | config_path: . 25 | config_name: config 26 | 27 | # The different number of nodes to use for the scalability testing 28 | scalability_nodes: "1, 2, 4, 8" 29 | 30 | # Variables in the curly brackets, "{}", will be overridden by the builder 31 | training_cmd: "$(which itwinai) exec-pipeline \ 32 | --config-path {config_path} \ 33 | --config-name {config_name} \ 34 | +pipe_key={pipe_key} \ 35 | strategy={dist_strat}" 36 | 37 | # WARNING: If you, in the CLI, override any of the variables specified in the curly 38 | # brackets above, there will likely be a mismatch in the builder, causing potential 39 | # bugs.
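# For illustration only (hypothetical rendering, not part of the original config):
# with the default values above, the builder would expand the training command
# roughly into
#
#   itwinai exec-pipeline --config-path . --config-name config \
#     +pipe_key=training_pipeline strategy=ddp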
40 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | name: Upload Python Package to PyPI when a Release is Created 11 | 12 | on: 13 | release: 14 | types: [created] 15 | 16 | jobs: 17 | pypi-publish: 18 | name: Publish release to PyPI 19 | runs-on: ubuntu-latest 20 | environment: 21 | name: pypi 22 | url: https://pypi.org/p/itwinai 23 | permissions: 24 | id-token: write 25 | steps: 26 | - uses: actions/checkout@v6 27 | - name: Set up Python 28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: "3.x" 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -q build 35 | # pip install setuptools wheel 36 | - name: Build package 37 | run: python -m build 38 | # python setup.py sdist bdist_wheel # Could also be python -m build 39 | - name: Publish package distributions to PyPI 40 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | submodules: 9 | include: 10 | - tutorials/plugins 11 | recursive: true 12 | 13 | # Set the OS, Python version and other tools you might need 14 | build: 15 | os: ubuntu-22.04 16 | tools: 17 | python: "3.10" 18 | # You can also specify other tool versions: 19 | # nodejs: "19" 20 | # rust: "1.64" 21 | # golang: "1.19" 22 | apt_packages: 23 | - gcc-11 24 | - g++-11 25 | # - cmake 26 | - pandoc 27 | 28 | jobs: 29 | pre_build: 30 | - typer itwinai.cli utils docs --output docs/api/cli.md 31 | - python docs/convert_admonitions.py --dir docs/ 32 | 33 | # Build documentation in the "docs/" directory with Sphinx 34 | sphinx: 35 | configuration: docs/conf.py 36 | fail_on_warning: true # Equivalent to -W in the Makefile 37 | 38 | # Optionally build your docs in additional formats such as PDF and ePub 39 | # formats: 40 | # - pdf 41 | # - epub 42 | 43 | # Optional but recommended, declare the Python requirements required 44 | # to build your documentation 45 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 46 | python: 47 | install: 48 | - requirements: docs/requirements.txt 49 | -------------------------------------------------------------------------------- /use-cases/cyclones/README.md: -------------------------------------------------------------------------------- 1 | # Tropical cyclone detection 2 | 3 | **Integration author(s)**: Matteo Bunino (CERN), Roman Machacek (CERN), Mario Ruettgers (JSC) 4 | 5 | The code is adapted from the CMCC use case's 6 | [repository](https://github.com/CMCC-Foundation/ml-tropical-cyclones-detection). 
7 | 8 | ## Setup env 9 | 10 | ```bash 11 | # After activating the environment 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## Dataset 16 | 17 | If the automatic download from Python does not work, try from the command line, from 18 | within the virtual environment: 19 | 20 | ```bash 21 | gdown https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD -O data/tmp_data/trainval --folder 22 | ``` 23 | 24 | For more info visit the [gdown](https://github.com/wkentaro/gdown) repository. 25 | 26 | ## Training 27 | 28 | Launch training: 29 | 30 | ```bash 31 | # # ONLY IF tensorflow>=2.16 32 | # export TF_USE_LEGACY_KERAS=1 33 | 34 | source ../../.venv-tf/bin/activate 35 | python train.py -p pipeline.yaml 36 | ``` 37 | 38 | On JSC, the dataset is pre-downloaded and you can use the following command: 39 | 40 | ```bash 41 | # # ONLY IF tensorflow>=2.16 42 | # export TF_USE_LEGACY_KERAS=1 43 | 44 | source ../../envAItf_hdfml/bin/activate 45 | python train.py -p pipeline.yaml --data_path /p/project/intertwin/smalldata/cmcc 46 | 47 | # Launch a job with SLURM 48 | sbatch startscript.sh 49 | ``` 50 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-scaling-test-jube/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking tutorial using JUBE 2 | 3 | Benchmarking of itwinai can also be performed with the JUBE Benchmarking Environment from JSC. 4 | The JUBE benchmarking tool is already set up in the environment files provided under `env-files`. 5 | 6 | ## Source the environment 7 | 8 | Find the location of your environment file along with the module load commands, such as: 9 | 10 | ```bash 11 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 12 | source envAI_hdfml/bin/activate 13 | ``` 14 | 15 | ## Run benchmark 16 | 17 | The benchmarks are defined in the `general_jobsys.xml` file. 18 | One can specify the configurations in terms of parameters such as the number of nodes. 19 | The benchmark can simply be launched with the command: 20 | 21 | ```bash 22 | jube run general_jobsys.xml 23 | ``` 24 | 25 | ## Monitor status of benchmark run 26 | 27 | The status of the run can be monitored with: 28 | 29 | ```bash 30 | jube continue bench_run --id last 31 | ``` 32 | 33 | ## Check results of the benchmark run 34 | 35 | The results can be viewed with: 36 | 37 | ```bash 38 | jube result -a bench_run --id last 39 | ``` 40 | 41 | This will create a `result-csv.dat` file in the `results` folder. 42 | 43 | The scaling and efficiency plots can be generated with the `bench_plot.ipynb` file 44 | which takes the `result-csv.dat` file as input.
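For a quick look at the results outside of the notebook, the table can also be loaded directly. A minimal sketch (assuming `result-csv.dat` is a plain comma-separated file and that pandas is installed in the active environment):

```python
import pandas as pd

# Load the result table produced by `jube result -a bench_run --id last`
df = pd.read_csv("results/result-csv.dat")

# Inspect the recorded parameters and metrics, e.g. number of nodes vs. epoch time
print(df.head())
```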
45 | -------------------------------------------------------------------------------- /use-cases/3dgan/run-provenance-experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | rm -rf slurm_logs mllogs 13 | mkdir slurm_logs 14 | 15 | # ========== FOR JSC ========== 16 | # ml --force purge 17 | # ml Stages/2024 GCC CUDA/12 cuDNN Python 18 | 19 | # SLURM_SCRIPT="slurm.jsc.sh" 20 | # source ../../envAI_hdfml/bin/activate 21 | # ========== FOR JSC ========== 22 | 23 | # ========== FOR Vega ========== 24 | SLURM_SCRIPT="slurm.vega.sh" 25 | source ../../.venv-pytorch/bin/activate 26 | # ========== FOR Vega ========== 27 | 28 | # Launch experiments 29 | 30 | # 1 worker: no SLURM needed 31 | itwinai exec-pipeline 1> slurm_logs/1_worker.out 2> slurm_logs/1_worker.err 32 | 33 | # 4, 8, 16... workers 34 | sbatch --wait --nodes=1 --output=slurm_logs/4_worker.out --error=slurm_logs/4_worker.err $SLURM_SCRIPT 35 | sbatch --wait --nodes=2 --output=slurm_logs/8_worker.out --error=slurm_logs/8_worker.err $SLURM_SCRIPT 36 | sbatch --wait --nodes=4 --output=slurm_logs/16_worker.out --error=slurm_logs/16_worker.err $SLURM_SCRIPT 37 | sbatch --wait --nodes=8 --output=slurm_logs/32_worker.out --error=slurm_logs/32_worker.err $SLURM_SCRIPT -------------------------------------------------------------------------------- /ci/src/main/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Dagger module for itwinai CI. 11 | 12 | This module provides logic to build containers, run tests with pytest, and more. 13 | 14 | Since itwinai is designed for HPC deployment, the containers need to be tested on relevant 15 | computing environments with hardware (e.g., GPUs) and software (e.g., CUDA) not accessible 16 | in standard GitHub Actions VMs. By deploying interLink within the CI pipeline, some tests 17 | can be offloaded to run on HPC. 18 | 19 | 20 | 21 | Additionally, since HPC systems prefer Singularity/Apptainer images over Docker, this 22 | module enables the conversion and publication of Docker containers as SIF files. 23 | 24 | Two CI pipelines are provided: a development pipeline, which is simpler and does not 25 | run tests on HPC, and a release pipeline, where containers undergo thorough testing on 26 | HPC, are converted to Singularity, and are pushed to both Docker and Singularity 27 | container registries.
28 | """ 29 | 30 | from .main import Itwinai as Itwinai 31 | 32 | __all__ = ["Itwinai"] 33 | -------------------------------------------------------------------------------- /tests/use-cases/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # - Anna Lappe - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | import os 12 | import subprocess 13 | from typing import Callable 14 | 15 | import pytest 16 | 17 | FNAMES = [ 18 | "pipeline.yaml", 19 | "startscript", 20 | ] 21 | 22 | 23 | @pytest.fixture 24 | def check_folder_structure() -> Callable: 25 | """Verify that the use case folder complies with some predefined structure.""" 26 | 27 | def _check_structure(root: str): 28 | for fname in FNAMES: 29 | fpath = os.path.join(root, fname) 30 | assert os.path.isfile(fpath), f"'{fname}' is missing in '{fpath}'" 31 | 32 | return _check_structure 33 | 34 | 35 | @pytest.fixture 36 | def install_requirements() -> Callable: 37 | """Install requirements.txt, if present in root folder.""" 38 | 39 | def _install_reqs(root: str, env_prefix: str): 40 | req_path = os.path.join(root, "requirements.txt") 41 | if os.path.isfile(req_path): 42 | cmd = f"{env_prefix}/bin/pip install --no-cache-dir -r {req_path}" 43 | subprocess.run(cmd.split(), check=True) 44 | 45 | return _install_reqs 46 | -------------------------------------------------------------------------------- /docs/getting-started/plugins-list.rst: -------------------------------------------------------------------------------- 1 | Current List of itwinai Plugins 2 | =============================== 3 | 4 | Below is a list of existing **itwinai plugins**, which correspond to scientific use cases that have been integrated into the itwinai framework. 5 | 6 | Physics Sciences 7 | ---------------- 8 | 9 | - `3DGAN – Fast Simulation of Particles in Calorimeters `__ 10 | - `GlitchFlow – Noise Generation for Gravitational Waves Analysis at Virgo `__ 11 | - `Pulsar Detection (Radio Astronomy) `__ 12 | - `Machine Learned Particle Flow Reconstruction (MLPF) `__ 13 | - `Normflow - Normalizing flows as a generative model for lattice field theory `__ 14 | 15 | Environmental Sciences 16 | ----------------------- 17 | 18 | - `Hython – Hydrological Modelling for Drought Early Warnings `__ 19 | - `xtclim – ML-based extreme events detection and characterization (CERFACS) `__ 20 | - `AtmoRep – A Stochastic Model of Atmosphere Dynamics `__ 21 | 22 | Contribute Your Plugin 23 | ----------------------- 24 | 25 | If you are developing a plugin and would like it to be listed on this page, feel free to open a pull request to update it! 26 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this with in the runall or scaling mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 
3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | # 6 | num_nodes: 1 7 | num_tasks_per_node: 1 8 | gpus_per_node: 4 9 | cpus_per_task: 16 10 | 11 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 12 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 13 | itwinai_trainer: false 14 | 15 | account: intertwin 16 | time: 00:15:00 17 | partition: develbooster 18 | 19 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 20 | # if you override the dist_strat in the CLI, then these will already have been evaluated 21 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 22 | # the config and avoid overriding it in the CLI. 23 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 24 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 25 | job_name: tutorial-${dist_strat}-job 26 | 27 | # The different number of nodes to use for the scalability testing 28 | scalability_nodes: "1, 2, 4" 29 | 30 | python_venv: ../../../.venv 31 | 32 | # If you want to manually override the training command, uncomment the following: 33 | # training_cmd: | 34 | # $(which itwinai) exec-pipeline \ 35 | # --config-path ${config_file} \ 36 | # +pipe_key=${pipe_key} \ 37 | # strategy=${dist_strat} 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | # Short Description of the issue 14 | 15 | 20 | 21 | ## Environment 22 | 23 | 26 | 27 | - Operating System: 28 | - Other related components versions: 29 | 30 | ## Steps to reproduce 31 | 32 | 36 | 37 | ## Logs, stacktrace, or other symptoms 38 | 39 | 44 | 45 | ```shell 46 | output 47 | ``` 48 | 49 | 51 | 52 | # Summary of proposed changes 53 | -------------------------------------------------------------------------------- /src/itwinai/tensorflow/models/mnist.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | 12 | # import tensorflow.keras as keras 13 | from typing import List 14 | 15 | import tensorflow as tf 16 | 17 | 18 | class MNIST_Model(tf.keras.Model): 19 | def __init__(self, input_shape: List[int] = (28, 28, 1), output_shape: int = 10): 20 | super().__init__() 21 | 22 | # LeNet5 (the constructor arguments are used instead of hardcoded shapes) 23 | self.model = tf.keras.Sequential( 24 | [ 25 | tf.keras.layers.Conv2D( 26 | filters=6, kernel_size=(3, 3), activation="relu", input_shape=input_shape 27 | ), 28 | tf.keras.layers.AveragePooling2D(2), 29 | tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation="relu"), 30 | tf.keras.layers.AveragePooling2D(2), 31 | tf.keras.layers.Flatten(), 32 | tf.keras.layers.Dense(units=120, activation="relu"), 33 | tf.keras.layers.Dense(units=84, activation="relu"), 34 | tf.keras.layers.Dense(units=output_shape), 35 | ] 36 | ) 37 | 38 | def call(self, inputs): 39 | return self.model(inputs) 40 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/asyncssh_config.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # before it was !/opt/conda/bin/python 3 | # -*- coding: utf-8 -*- 4 | # 5 | # D. Ciangottini 6 | # 7 | import asyncio 8 | import os 9 | import re 10 | import sys 11 | from subprocess import Popen 12 | 13 | import asyncssh 14 | from jupyterhub.singleuser import main 15 | 16 | ssh_host = os.environ.get("JHUB_HOST") 17 | ssh_url_port = os.environ.get("SSH_PORT") 18 | username = os.environ.get("JUPYTERHUB_USER") 19 | token = os.environ.get("JUPYTERHUB_API_TOKEN") 20 | 21 | fwd_port = os.environ.get("FWD_PORT") 22 | 23 | 24 | async def run_client(): 25 | async with asyncssh.connect( 26 | host=ssh_host, 27 | port=int(ssh_url_port), 28 | username=username, 29 | password=token, 30 | known_hosts=None, 31 | ) as conn: 32 | conn.set_keepalive(interval=14.0, count_max=10) 33 | listener = await conn.forward_remote_port( 34 | "0.0.0.0", 35 | int(fwd_port), 36 | "0.0.0.0", 37 | int(fwd_port), 38 | ) 39 | await listener.wait_closed() 40 | 41 | 42 | if __name__ == "__main__": 43 | print("Connecting ssh...") 44 | loop = asyncio.get_event_loop() 45 | loop.create_task(run_client()) 46 | 47 | print("Configuring Rucio extension...") 48 | p = Popen(["/usr/local/bin/setup.sh"]) 49 | p.wait() # block until the setup script has finished 50 | 51 | print("Starting JLAB") 52 | sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) 53 | sys.exit(main()) 54 | 55 | -------------------------------------------------------------------------------- /env-files/torch/README.md: -------------------------------------------------------------------------------- 1 | # Container image definition files for PyTorch-based itwinai 2 | 3 | ## Singularity 4 | 5 | This example is for building the itwinai container for LUMI (AMD GPUs) locally (use `scp` to transfer the final image 6 | to LUMI). 7 | 8 | First navigate with `cd` to the base folder of itwinai. 9 | 10 | From there, download the singularity base image for pytorch with ROCm: 11 | 12 | ```bash 13 | singularity pull rocm-base-pytorch.sif REGISTRY_IMG 14 | ``` 15 | 16 | You can choose one of the following base images: 17 | 18 | - `oras://registry.egi.eu/dev.intertwin.eu/itwinai-dev:lumi-pytorch-rocm-6.1.3-python-3.12-pytorch-v2.4.1` 19 | - `oras://registry.cern.ch/itwinai/lumi:lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0-dockerhash-ef203c810cc9` 20 | 21 | Other base images can be found on LUMI at `/appl/local/containers/tested-containers` and 22 | `/appl/local/containers/sif-images`. See the 23 | [docs](https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/p/PyTorch/#getting-the-container-image) 24 | for more info. 25 | 26 | Then build the final container with: 27 | 28 | ```bash 29 | sudo singularity build --tmpdir /tmp itwinai-lumi-dev.sif env-files/torch/rocm.def 30 | ``` 31 | 32 | - `/tmp` is a location with enough storage space to support the build. 33 | 34 | Available itwinai images can be found at: 35 | 36 | - `oras://registry.egi.eu/dev.intertwin.eu/itwinai-dev:lumi-itwinai-pytorch-rocm-6.1.3-python-3.12-pytorch-v2.4.1` 37 | - `oras://registry.cern.ch/itwinai/lumi:itwinai0.3.3-lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0-dockerhash-ef203c810cc9` 38 | -------------------------------------------------------------------------------- /docs/testing-with-pytest.md: -------------------------------------------------------------------------------- 1 | # Test with `pytest` 2 | 3 | Do this only if you are a developer wanting to test your code with pytest. 
4 | 5 | First, you need to create virtual environments both for torch and tensorflow, 6 | following the instructions above, depending on the system that you are using 7 | (e.g., JSC). 8 | 9 | To select the name of the torch and tf environments in which the tests will be 10 | executed, you can set the following environment variables. 11 | If these env variables are not set, the testing suite will assume that the 12 | PyTorch environment is under 13 | `.venv-pytorch` and the TensorFlow environment is under `.venv-tf`. 14 | 15 | ```bash 16 | export TORCH_ENV="my_torch_env" 17 | export TF_ENV="my_tf_env" 18 | ``` 19 | 20 | Functional tests (marked with `pytest.mark.functional`) will be executed under the 21 | `/tmp/pytest` location to guarantee isolation among tests. 22 | 23 | To run functional tests use: 24 | 25 | ```bash 26 | pytest -v tests/ -m "functional" 27 | ``` 28 | 29 | > [!NOTE] 30 | > Depending on the system that you are using, we implemented a tailored Makefile 31 | > target to run the test suite on it. Read these instructions until the end! 32 | 33 | We provide some Makefile targets to run the whole test suite including unit, integration, 34 | and functional tests. Choose the right target depending on the system that you are using: 35 | 36 | Makefile targets: 37 | 38 | - Juelich Supercomputer (JSC): `test-jsc` 39 | - In any other case: `test` 40 | 41 | For instance, to run the test suite on your laptop, use: 42 | 43 | ```bash 44 | make test 45 | ``` 46 | -------------------------------------------------------------------------------- /use-cases/virgo/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this in the runall or scaling-test mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | 6 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 7 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 8 | 9 | account: intertwin 10 | time: 00:30:00 11 | partition: develbooster 12 | 13 | num_nodes: 1 14 | num_tasks_per_node: 1 15 | gpus_per_node: 4 16 | cpus_per_task: 16 17 | 18 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 19 | # if you override the dist_strat in the CLI, then these will already have been evaluated 20 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 21 | # the config and avoid overriding it in the CLI. 22 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 23 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 24 | job_name: virgo-${dist_strat}-job 25 | ################################## 26 | 27 | python_venv: ../../.venv 28 | pipe_key: training_pipeline 29 | config_path: .
30 | config_name: config 31 | 32 | 33 | # The different number of nodes to use for the scalability testing 34 | scalability_nodes: "1, 2, 4" 35 | 36 | # The following manually overrides the training command built by default: 37 | training_cmd: "$(which itwinai) exec-pipeline \ 38 | --config-path {config_path} \ 39 | --config-name {config_name} \ 40 | +pipe_key={pipe_key} \ 41 | strategy={dist_strat}" 42 | -------------------------------------------------------------------------------- /env-files/torch/install-horovod-deepspeed-cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Jarl Sondre Sæther 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | set -e 14 | 15 | # DeepSpeed variables 16 | export DS_BUILD_CCL_COMM=1 17 | export DS_BUILD_UTILS=1 18 | export DS_BUILD_AIO=1 19 | export DS_BUILD_FUSED_ADAM=1 20 | export DS_BUILD_FUSED_LAMB=1 21 | export DS_BUILD_TRANSFORMER=1 22 | export DS_BUILD_STOCHASTIC_TRANSFORMER=1 23 | export DS_BUILD_TRANSFORMER_INFERENCE=1 24 | 25 | # Use --no-cache-dir to avoid caching packages in your $HOME, which may have small disk quota 26 | uv pip install --no-cache-dir --no-build-isolation "deepspeed==0.16.8" 27 | 28 | # Horovod variables 29 | export LDSHARED="$CC -shared" 30 | export CMAKE_CXX_STANDARD=17 31 | 32 | export HOROVOD_MPI_THREADS_DISABLE=1 33 | export HOROVOD_CPU_OPERATIONS=MPI 34 | 35 | export HOROVOD_GPU_ALLREDUCE=NCCL 36 | export HOROVOD_NCCL_LINK=SHARED 37 | export HOROVOD_NCCL_HOME=$EBROOTNCCL 38 | 39 | export HOROVOD_WITH_PYTORCH=1 40 | export HOROVOD_WITHOUT_TENSORFLOW=1 41 | export HOROVOD_WITHOUT_MXNET=1 42 | 43 | uv pip install --no-cache-dir --no-build-isolation git+https://github.com/horovod/horovod.git@3a31d93 44 | 45 | echo "Finished Horovod and DeepSpeed installation script!" 46 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch-tutorial-containers.rst: -------------------------------------------------------------------------------- 1 | itwinai and containers (Docker and Singularity) 2 | =================================================== 3 | 4 | In this tutorial you will learn how to use itwinai's container images to run your ML workflows 5 | without having to set up the Python environment by means of virtual environments. 6 | 7 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-containers/README.md 8 | :parser: myst_parser.sphinx_ 9 | 10 | 11 | Shell scripts 12 | -------------- 13 | 14 | run_docker.sh 15 | ++++++++++++++++ 16 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/run_docker.sh 17 | :language: bash 18 | 19 | slurm.sh 20 | ++++++++++++ 21 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/slurm.sh 22 | :language: bash 23 | 24 | 25 | runall.sh 26 | ++++++++++++++++ 27 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/runall.sh 28 | :language: bash 29 | 30 | 31 | Pipeline configuration 32 | ----------------------- 33 | 34 | config.yaml 35 | ++++++++++++ 36 | 37 | .. 
literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/config.yaml 38 | :language: yaml 39 | 40 | 41 | Python files 42 | ------------------ 43 | 44 | model.py 45 | ++++++++++++ 46 | 47 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/model.py 48 | :language: python 49 | 50 | dataloader.py 51 | +++++++++++++++ 52 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/dataloader.py 53 | :language: python 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/use-cases/3dgan_doc.rst: -------------------------------------------------------------------------------- 1 | Fast particle detector simulation (CERN) 2 | ======================================== 3 | 4 | This use case trains a 3D Generative Adversarial Network (3DGAN) for 5 | generation of images of calorimeter depositions. It is based on the 6 | prototype `3DGAN `_ model 7 | developed at CERN and is implemented using the PyTorch Lightning framework. 8 | 9 | This section covers the CERN use case that utilizes the `torch-lightning` framework 10 | for training and evaluation. Below you can find instructions to execute the CERN use 11 | case and its integral scripts: 12 | 13 | Integration with itwinai 14 | ------------------------ 15 | 16 | .. include:: ../../use-cases/3dgan/README.md 17 | :parser: myst_parser.sphinx_ 18 | :start-line: 2 19 | 20 | 21 | 3DGAN plugin for itwinai 22 | ------------------------ 23 | 24 | The integration code of the 3DGAN model has been adapted to be distributed as an independent 25 | itwinai plugin called `itwinai-3dgan-plugin `_. 26 | 27 | 28 | Offloading jobs via interLink 29 | ----------------------------- 30 | 31 | The CERN use case also has an integration with `interLink `_. You can find 32 | the relevant files in the 33 | `interLink directory on GitHub `_. 34 | You can also look at the README for more information: 35 | 36 | 37 | .. 
include:: ../../use-cases/3dgan/interLink/README.md 38 | :parser: myst_parser.sphinx_ 39 | :start-line: 0 40 | -------------------------------------------------------------------------------- /use-cases/3dgan/slurm.jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SLURM jobscript for JSC systems 4 | 5 | # general configuration of the job 6 | #SBATCH --job-name=PrototypeTest 7 | #SBATCH --account=intertwin 8 | #SBATCH --mail-user= 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --output=job.out 11 | #SBATCH --error=job.err 12 | #SBATCH --time=00:30:00 13 | 14 | # configure node and process count on the CM 15 | #SBATCH --partition=batch 16 | #SBATCH --nodes=2 17 | #SBATCH --ntasks-per-node=1 18 | #SBATCH --cpus-per-task=4 19 | #SBATCH --gpus-per-node=4 20 | 21 | #SBATCH --exclusive 22 | 23 | # gres options have to be disabled for deepv 24 | #SBATCH --gres=gpu:4 25 | 26 | # load modules 27 | ml --force purge 28 | ml Stages/2024 GCC CUDA/12 cuDNN Python 29 | # ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 30 | # ml Python CMake HDF5 PnetCDF libaio mpi4py 31 | 32 | # shellcheck source=/dev/null 33 | source ~/.bashrc 34 | 35 | # Activate the environment 36 | source ../../envAI_hdfml/bin/activate 37 | 38 | GAN_DATASET="exp_data" #"/p/scratch/intertwin/datasets/cern/" 39 | 40 | # launch training 41 | TRAINING_CMD="$(which itwinai) exec-pipeline num_nodes=$SLURM_NNODES \ 42 | dataset_location=$GAN_DATASET " 43 | 44 | srun --cpu-bind=none --ntasks-per-node=1 \ 45 | bash -c "torchrun \ 46 | --log_dir='logs_torchrun' \ 47 | --nnodes=$SLURM_NNODES \ 48 | --nproc_per_node=$SLURM_GPUS_PER_NODE \ 49 | --rdzv_id=$SLURM_JOB_ID \ 50 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 51 | --rdzv_backend=c10d \ 52 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ 53 | $TRAINING_CMD " -------------------------------------------------------------------------------- /env-files/torch/generic_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ -z "$ENV_NAME" ]; then 14 | ENV_NAME=".venv-pytorch" 15 | fi 16 | 17 | work_dir=$PWD 18 | 19 | # Create the python venv if it doesn't already exist 20 | if [ -d "${work_dir}/$ENV_NAME" ];then 21 | echo "env $ENV_NAME already exists" 22 | else 23 | python3 -m venv $ENV_NAME 24 | echo "$ENV_NAME environment is created in ${work_dir}" 25 | fi 26 | 27 | # Activate the venv and then install itwinai as editable 28 | source $ENV_NAME/bin/activate 29 | pip install uv 30 | 31 | if [ -z "$NO_CUDA" ]; then 32 | # Install with CUDA support 33 | uv pip install -e ".[torch,dev]" \ 34 | --no-cache-dir \ 35 | --extra-index-url https://download.pytorch.org/whl/cu126 36 | else 37 | # Install without CUDA support (avoid uv here) 38 | pip install -e ".[torch,dev]" \ 39 | --no-cache-dir \ 40 | --extra-index-url https://download.pytorch.org/whl/cpu 41 | fi 42 | 43 | 44 | # Install Prov4ML 45 | if [[ "$(uname)" == "Darwin" ]]; then 46 | uv pip install --no-cache-dir "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2" 47 | else 48 
| # Assuming Nvidia GPUs are available 49 | uv pip install --no-cache-dir "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2" 50 | fi 51 | -------------------------------------------------------------------------------- /src/itwinai/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Linus Eickhoff 5 | # 6 | # Credit: 7 | # - Linus Eickhoff - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Constants used in the itwinai project.""" 11 | from pathlib import Path 12 | 13 | # Directory names for logging and profiling data 14 | PROFILER_TRACES_DIR_NAME = "profiler-traces" 15 | 16 | # mlflow 17 | RELATIVE_MLFLOW_PATH = Path("mllogs/mlflow") 18 | BASE_EXP_NAME: str = "unnamed-experiment" 19 | PROFILING_AVG_NAME: str = "torch_profiling_averages" 20 | 21 | adjectives = [ 22 | "quantum", 23 | "relativistic", 24 | "wavy", 25 | "entangled", 26 | "chiral", 27 | "tachyonic", 28 | "superluminal", 29 | "anomalous", 30 | "hypercharged", 31 | "fermionic", 32 | "hadronic", 33 | "quarky", 34 | "holographic", 35 | "dark", 36 | "force-sensitive", 37 | "chaotic", 38 | ] 39 | 40 | names = [ 41 | "neutrino", 42 | "graviton", 43 | "muon", 44 | "gluon", 45 | "tachyon", 46 | "quasar", 47 | "pulsar", 48 | "blazar", 49 | "meson", 50 | "boson", 51 | "hyperon", 52 | "starlord", 53 | "groot", 54 | "rocket", 55 | "yoda", 56 | "skywalker", 57 | "sithlord", 58 | "midichlorian", 59 | "womp-rat", 60 | "beskar", 61 | "mandalorian", 62 | "ewok", 63 | "vibranium", 64 | "nova", 65 | "gamora", 66 | "drax", 67 | "ronan", 68 | "thanos", 69 | "cosmo", 70 | ] 71 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/create_inference_sample.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Create a simple inference dataset sample and a checkpoint.""" 11 | 12 | import argparse 13 | import os 14 | 15 | import torch 16 | from dataloader import InferenceMNIST 17 | from model import Net 18 | 19 | 20 | def mnist_torch_inference_files( 21 | root: str = ".", 22 | samples_path: str = "mnist-sample-data/", 23 | model_name: str = "mnist-pre-trained.pth", 24 | ): 25 | """Create a sample dataset and a fake model checkpoint to test the MNIST 26 | inference workflow. Assumes it is run from the use case folder. 27 | 28 | Args: 29 | root (str, optional): where to create the files. Defaults to '.'. 30 | samples_path (str, optional): where to save the sample images. Defaults to 'mnist-sample-data/'. 31 | model_name (str, optional): name of the fake checkpoint file. Defaults to 'mnist-pre-trained.pth'. 
32 | """ 33 | 34 | sample = os.path.join(root, samples_path) 35 | InferenceMNIST.generate_jpg_sample(sample, 10) 36 | 37 | # Fake checkpoint 38 | dummy_nn = Net() 39 | mdl_ckpt = os.path.join(root, model_name) 40 | torch.save(dummy_nn, mdl_ckpt) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("--root", type=str, default=".") 46 | parser.add_argument("--samples-path", type=str, default="mnist-sample-data") 47 | parser.add_argument("--model-name", type=str, default="mnist-pre-trained.pth") 48 | args = parser.parse_args() 49 | mnist_torch_inference_files(**vars(args)) 50 | -------------------------------------------------------------------------------- /docs/installation/user_installation.rst: -------------------------------------------------------------------------------- 1 | User Installation (for Non-Developers) 2 | ====================================== 3 | 4 | This guide provides step-by-step instructions for installing the ``itwinai`` library for 5 | users. 6 | 7 | .. The explanation for creating a venv is the same for developers and users 8 | .. include:: ./software_prerequisites.rst 9 | 10 | 11 | Installing the ``itwinai`` Library 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | You can choose if you want to install ``itwinai`` with support for either PyTorch or 14 | TensorFlow by using extras: 15 | 16 | .. tab-set:: 17 | 18 | .. tab-item:: PyTorch 19 | 20 | To install ``itwinai`` with PyTorch without GPU acceleration, you can use the 21 | following command: 22 | 23 | .. code-block:: bash 24 | 25 | uv pip install "itwinai[torch]" 26 | 27 | To enable GPU acceleration, you can use the following command: 28 | 29 | .. code-block:: bash 30 | 31 | uv pip install "itwinai[torch]" \ 32 | --extra-index-url https://download.pytorch.org/whl/cu121 33 | 34 | 35 | .. tab-item:: TensorFlow 36 | 37 | To install ``itwinai`` with TensorFlow without GPU acceleration, you can use the 38 | following command: 39 | 40 | .. code-block:: bash 41 | 42 | uv pip install "itwinai[tf]" 43 | 44 | To enable GPU acceleration, you can use the following command: 45 | 46 | .. code-block:: bash 47 | 48 | uv pip install "itwinai[tf-cuda]" 49 | 50 | .. Explanation for installing horovod, DS, and other packages that need to be installed AFTER itwinai 51 | .. include:: ./post_itwinai_installation.rst 52 | -------------------------------------------------------------------------------- /.github/linters/.ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 95 2 | 3 | [lint] 4 | select = [ 5 | "E", # pycodestyle errors 6 | "F", # pyflakes: undefined names, unused imports, etc. 7 | "I", # isort: import sorting 8 | "W", # pycodestyle warnings 9 | "B", # flake8-bugbear: likely bugs and bad practices (e.g. mutable defaults) 10 | "C4", # flake8-comprehensions: unnecessary or suboptimal comprehensions 11 | "SIM", # flake8-simplify: redundant ifs, returns, boolean logic 12 | "UP", # pyupgrade: use modern Python syntax (e.g. f-strings, `Path()` literals) 13 | "PTH", # flake8-use-pathlib: use pathlib instead of os.path 14 | "N", # pep8-naming: naming conventions for classes, functions, variables 15 | ] 16 | ignore = [ 17 | "E203", # Whitespace before ':' – conflicts with Black 18 | "PTH109", # Allow os.getcwd() 19 | "PTH122", # Avoid replacing os.path.splitext – Path.suffix drops info (e.g. 
.tar.gz) 20 | "PTH123", # Allow use of builtin open() – Path.open() adds no real benefit 21 | "UP006", # Keep using typing.List/Dict/Set – prefer consistency over builtin generics 22 | "UP035", # Same as above – avoid auto-converting to list[]/dict[] syntax 23 | "B904", # Don't require `from err` in CLI code – breaks Typer/Click behavior 24 | "SIM108", # Don't always use ternary operators — they can be kind of hard to read sometimes 25 | "N806", # Allow UPPER_CASE_VARIABLE_NAMES in function scopes (for default values etc.) 26 | "N812", # Allow importing stuff as uppercase (e.g. function as F) 27 | ] 28 | fixable = ["ALL"] 29 | 30 | [format] 31 | quote-style = "double" 32 | indent-style = "space" 33 | skip-magic-trailing-comma = false 34 | line-ending = "auto" 35 | -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/pipeline.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | # General config 12 | verbose: auto 13 | micro_batch_size: 17 14 | epochs: 3 15 | checkpoints_path: checkpoints 16 | tb_log_dir: ./logs 17 | 18 | # Training pipeline 19 | pipeline: 20 | _target_: itwinai.pipeline.Pipeline 21 | steps: 22 | - _target_: dataloader.MNISTDataGetter 23 | - _target_: dataloader.MNISTDataPreproc 24 | classes: 10 25 | - _target_: itwinai.tensorflow.trainer.TensorflowTrainer 26 | epochs: ${epochs} 27 | micro_batch_size: ${micro_batch_size} 28 | verbose: ${verbose} 29 | model_compile_config: 30 | loss: 31 | _target_: tensorflow.keras.losses.CategoricalCrossentropy 32 | from_logits: False 33 | optimizer: 34 | _target_: tensorflow.keras.optimizers.Adam 35 | learning_rate: 0.001 36 | model_config: 37 | _target_: itwinai.tensorflow.models.mnist.MNIST_Model 38 | input_shape: [28, 28, 1] 39 | output_shape: 10 40 | callbacks: 41 | - _target_: keras.callbacks.EarlyStopping 42 | patience: 2 43 | - _target_: keras.callbacks.ModelCheckpoint 44 | filepath: ${checkpoints_path}/model.{epoch:02d}-{val_loss:.2f}.keras 45 | - _target_: keras.callbacks.TensorBoard 46 | log_dir: ${tb_log_dir} 47 | 48 | -------------------------------------------------------------------------------- /use-cases/lattice-qcd/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2023 Javad Komijani 2 | 3 | 4 | from setuptools import setup 5 | 6 | 7 | def readme(): 8 | with open('README.rst') as f: 9 | return f.read() 10 | 11 | 12 | packages = [ 13 | 'normflow', 14 | 'normflow.action', 15 | 'normflow.lib', 16 | 'normflow.lib.combo', 17 | 'normflow.lib.indexing', 18 | 'normflow.lib.linalg', 19 | 'normflow.lib.spline', 20 | 'normflow.lib.stats', 21 | 'normflow.mask', 22 | 'normflow.mcmc', 23 | 'normflow.nn', 24 | 'normflow.nn.scalar', 25 | 'normflow.prior' 26 | ] 27 | 28 | package_dir = { 29 | 'normflow': 'src', 30 | 'normflow.action': 'src/action', 31 | 'normflow.lib': 'src/lib', 32 | 'normflow.lib.combo': 'src/lib/combo', 33 | 'normflow.lib.indexing': 'src/lib/indexing', 34 | 'normflow.lib.linalg': 'src/lib/linalg', 35 | 'normflow.lib.spline': 'src/lib/spline', 36 | 'normflow.lib.stats': 'src/lib/stats', 37 | 
'normflow.mask': 'src/mask', 38 | 'normflow.mcmc': 'src/mcmc', 39 | 'normflow.nn': 'src/nn', 40 | 'normflow.nn.scalar': 'src/nn/scalar', 41 | 'normflow.prior': 'src/prior' 42 | } 43 | 44 | setup(name='normflow', 45 | version='1.1', 46 | description='Normalizing flow for generating lattice field configurations', 47 | packages=packages, 48 | package_dir=package_dir, 49 | url='http://github.com/jkomijani/normflow', 50 | author='Javad Komijani', 51 | author_email='jkomijani@gmail.com', 52 | license='MIT', 53 | install_requires=['numpy>=1.20', 'torch>=2.0'], 54 | zip_safe=False 55 | ) 56 | -------------------------------------------------------------------------------- /docs/how-it-works/training/explain_ddp.rst: -------------------------------------------------------------------------------- 1 | Explanation of Distributed Data Parallelism 2 | ------------------------------------------- 3 | 4 | **Author(s)**: Killian Verder (CERN), Matteo Bunino (CERN) 5 | 6 | Deep neural networks (DNN) are often extremely large and are trained on massive amounts 7 | of data, more than most computers have memory for. Even smaller DNNs can take days to 8 | train. Distributed Data Parallel (DDP) addresses these two issues, long training times 9 | and limited memory, by using multiple machines to host and train both model and data. 10 | 11 | Data parallelism is an easy way for a developer to vastly reduce training times. Rather 12 | than using single-node parallelism, DDP scales to multiple machines. This scaling 13 | maximises parallelisation of your model and drastically reduces training times. 14 | 15 | Another benefit of DDP is that it relaxes single-machine memory constraints. Since the 16 | dataset is sharded across several machines, and optimizer state or even model parameters 17 | can be sharded too with techniques such as DeepSpeed ZeRO (see below), it becomes possible to work with much larger datasets and models.
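To make the mechanics concrete, here is a minimal sketch of a single DDP training step in plain PyTorch (not itwinai-specific code), assuming it is launched with ``torchrun`` on a machine with NVIDIA GPUs:

.. code-block:: python

    import os

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP


    def main():
        # torchrun sets RANK, LOCAL_RANK and WORLD_SIZE for every worker process
        dist.init_process_group(backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)

        # Every worker holds a full replica of the model on its own GPU
        model = torch.nn.Linear(10, 1).to(local_rank)
        ddp_model = DDP(model, device_ids=[local_rank])
        optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

        # In a real job each worker receives a different shard of the dataset,
        # e.g. via torch.utils.data.distributed.DistributedSampler; random
        # tensors are used here to keep the sketch self-contained
        inputs = torch.randn(32, 10, device=f"cuda:{local_rank}")
        targets = torch.randn(32, 1, device=f"cuda:{local_rank}")

        loss = torch.nn.functional.mse_loss(ddp_model(inputs), targets)
        # backward() triggers the all-reduce that averages gradients across workers
        loss.backward()
        optimizer.step()

        dist.destroy_process_group()


    if __name__ == "__main__":
        main()

Such a script could be launched with, e.g., ``torchrun --nproc_per_node=4 ddp_sketch.py``; the itwinai trainers take care of this boilerplate (and of the Horovod and DeepSpeed equivalents) for you.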
18 | 19 | Below is a list of resources expanding on theoretical aspects and practical 20 | implementations of DDP: 21 | 22 | * Introduction to DP: https://siboehm.com/articles/22/data-parallel-training 23 | 24 | * https://pytorch.org/tutorials/beginner/ddp_series_theory.html 25 | 26 | * https://pytorch.org/tutorials/intermediate/ddp_tutorial.html 27 | 28 | * https://huggingface.co/blog/pytorch-ddp-accelerate-transformers 29 | 30 | 31 | Data-Parallelism with Deepspeed's Zero Redundancy Optimizer (ZeRO): 32 | 33 | * https://sumanthrh.com/post/distributed-and-efficient-finetuning/#zero-powered-data-parallelism 34 | 35 | 36 | Investigation of expected performance improvement: 37 | 38 | * https://www.mdpi.com/2079-9292/11/10/1525 39 | 40 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/runall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Clear SLURM logs (*.out and *.err files) 13 | rm -rf logs_slurm 14 | mkdir logs_slurm 15 | rm -rf logs_torchrun 16 | 17 | # DDP itwinai 18 | DIST_MODE="ddp" 19 | RUN_NAME="ddp-itwinai" 20 | TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=ddp' 21 | sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 22 | --job-name="$RUN_NAME-n$N" \ 23 | --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 24 | --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 25 | slurm.sh 26 | 27 | # DeepSpeed itwinai 28 | DIST_MODE="deepspeed" 29 | RUN_NAME="deepspeed-itwinai" 30 | TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=deepspeed' 31 | sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 32 | --job-name="$RUN_NAME-n$N" \ 33 | --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 34 | --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 35 | slurm.sh 36 | 37 | # # Horovod itwinai 38 | # DIST_MODE="horovod" 39 | # RUN_NAME="horovod-itwinai" 40 | # TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=horovod' 41 | # sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 42 | # --job-name="$RUN_NAME-n$N" \ 43 | # --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 44 | # --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 45 | # slurm.sh -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-1-mnist/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for a PyTorch model trained on the MNIST dataset 2 | 3 | **Author(s)**: Matteo Bunino (CERN), Jarl Sondre Sæther (CERN) 4 | 5 | In this tutorial we show how to use torch `DistributedDataParallel` (DDP), Horovod and 6 | DeepSpeed from the same client code. 7 | Note that the environment is tested on the HDFML system at JSC. For other systems, 8 | the module versions might need to change accordingly. 9 | 10 | ## Setup 11 | 12 | First, from the root of this repository, build the environment containing 13 | pytorch, horovod and deepspeed. 
You can *try* with:
14 |
15 | ```bash
16 | # Creates a Python venv called envAI_hdfml
17 | make torch-gpu-jsc
18 | ```
19 |
20 | Before launching training, since there is no internet connection on JSC's compute nodes,
21 | you need to download the dataset beforehand while on the login node:
22 |
23 | ```bash
24 | source ../../../envAI_hdfml/bin/activate
25 | python train.py --download-only
26 | ```
27 |
28 | This command creates a local folder called "MNIST" with the dataset.
29 |
30 | ## Distributed training
31 |
32 | You can run your training with SLURM by using the `itwinai` SLURM Builder. Use the
33 | `slurm_config.yaml` file to specify your SLURM parameters and then preview your script
34 | with the following command:
35 |
36 | ```bash
37 | itwinai generate-slurm -c slurm_config.yaml --no-save-script --no-submit-job
38 | ```
39 |
40 | If you are happy with the script, you can then run it by omitting `--no-submit-job`:
41 |
42 | ```bash
43 | itwinai generate-slurm -c slurm_config.yaml --no-save-script
44 | ```
45 |
46 | If you want to store a copy of the script in a folder, then you can similarly omit
47 | `--no-save-script`:
48 |
49 | ```bash
50 | itwinai generate-slurm -c slurm_config.yaml
51 | ```
52 |
-------------------------------------------------------------------------------- /src/itwinai/torch/reproducibility.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """This module provides the tools to support reproducible execution of torch scripts."""
11 |
12 | import random
13 | from typing import Optional
14 |
15 | import numpy as np
16 | import torch
17 |
18 |
19 | def seed_worker(worker_id):
20 |     """Seed DataLoader worker."""
21 |     worker_seed = torch.initial_seed() % 2**32
22 |     np.random.seed(worker_seed)
23 |     random.seed(worker_seed)
24 |
25 |
26 | def set_seed(rnd_seed: Optional[int], deterministic_cudnn: bool = True) -> torch.Generator:
27 |     """Set torch random seed and return a PRNG object.
28 |
29 |     Args:
30 |         rnd_seed (Optional[int]): random seed. If None, the seed is not set.
31 |         deterministic_cudnn (bool): if True, sets
32 |             ``torch.backends.cudnn.benchmark = False``, which may affect
33 |             performance.
34 |
35 |     Returns:
36 |         torch.Generator: PRNG object.
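    Example (a minimal usage sketch; ``dataset`` is assumed to be any
    ``torch.utils.data.Dataset`` instance you already have):

        >>> from torch.utils.data import DataLoader
        >>> g = set_seed(42)
        >>> loader = DataLoader(dataset, worker_init_fn=seed_worker, generator=g)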
37 | """ 38 | g = torch.Generator() 39 | if rnd_seed is not None: 40 | # Deterministic execution 41 | np.random.seed(rnd_seed) 42 | random.seed(rnd_seed) 43 | torch.manual_seed(rnd_seed) 44 | g.manual_seed(rnd_seed) 45 | if torch.cuda.is_available(): 46 | torch.cuda.manual_seed(rnd_seed) 47 | torch.cuda.manual_seed_all(rnd_seed) 48 | if deterministic_cudnn: 49 | torch.backends.cudnn.benchmark = False 50 | torch.backends.cudnn.deterministic = True 51 | return g 52 | -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/dataloader.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | 12 | from typing import Tuple 13 | 14 | import tensorflow as tf 15 | import tensorflow.keras as keras 16 | 17 | from itwinai.components import DataGetter, DataProcessor, monitor_exec 18 | 19 | 20 | class MNISTDataGetter(DataGetter): 21 | def __init__(self): 22 | super().__init__() 23 | self.save_parameters(**self.locals2params(locals())) 24 | 25 | @monitor_exec 26 | def execute(self) -> Tuple: 27 | train, test = keras.datasets.mnist.load_data() 28 | return train, test 29 | 30 | 31 | class MNISTDataPreproc(DataProcessor): 32 | def __init__(self, classes: int): 33 | super().__init__() 34 | self.save_parameters(**self.locals2params(locals())) 35 | self.classes = classes 36 | 37 | @monitor_exec 38 | def execute( 39 | self, 40 | *datasets, 41 | ) -> Tuple: 42 | options = tf.data.Options() 43 | options.experimental_distribute.auto_shard_policy = ( 44 | tf.data.experimental.AutoShardPolicy.DATA 45 | ) 46 | preprocessed = [] 47 | for dataset in datasets: 48 | x, y = dataset 49 | y = keras.utils.to_categorical(y, self.classes) 50 | sliced = tf.data.Dataset.from_tensor_slices((x, y)) 51 | sliced = sliced.with_options(options) 52 | preprocessed.append(sliced) 53 | return tuple(preprocessed) 54 | -------------------------------------------------------------------------------- /env-files/torch/createEnvJSC.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | if [ ! -f "env-files/torch/generic_torch.sh" ]; then 5 | echo "ERROR: env-files/torch/generic_torch.sh not found!" 6 | exit 1 7 | fi 8 | 9 | # set dir 10 | cDir=$PWD 11 | 12 | # get sys info 13 | sysN="$(uname -n | cut -f2- -d.)" 14 | sysN="${sysN%%[0-9]*}" 15 | 16 | # load modules 17 | # NOTE: REFLECT THEM IN THE MAIN README! 
18 | ml --force purge 19 | ml Stages/2025 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 20 | ml Python CMake HDF5 PnetCDF libaio mpi4py git 21 | 22 | # Create and install torch env 23 | export ENV_NAME="envAI_$sysN" 24 | bash env-files/torch/generic_torch.sh 25 | source $ENV_NAME/bin/activate 26 | 27 | # fix IB IP config - FZJ specific 28 | if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then 29 | sed -i -e '5,100s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun 30 | echo """ 31 | import re 32 | import sys 33 | from torch.distributed.run import main 34 | from torch.distributed.elastic.agent.server import api as sapi 35 | 36 | def new_get_fq_hostname(): 37 | return _orig_get_fq_hostname().replace('.', 'i.', 1) 38 | 39 | if __name__ == '__main__': 40 | _orig_get_fq_hostname = sapi._get_fq_hostname 41 | sapi._get_fq_hostname = new_get_fq_hostname 42 | sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) 43 | sys.exit(main()) 44 | """ >> ${cDir}/envAI_${sysN}/bin/torchrun 45 | fi 46 | 47 | # JUBE benchmarking environment 48 | if [ -f "${cDir}/envAI_${sysN}/bin/jube" ]; then 49 | echo 'JUBE already installed' 50 | else 51 | pip3 install --no-cache-dir http://apps.fz-juelich.de/jsc/jube/jube2/download.php?version=latest 52 | fi 53 | 54 | # some tests 55 | echo "unit tests:" 56 | for item in 'torch' 'deepspeed' 'horovod';do 57 | python3 -c "import $item; print('$item version:',$item.__version__)" 58 | done 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.defaultFormatter": null, 4 | "editor.rulers": [ 5 | 95 6 | ], 7 | "cSpell.ignoreWords": [ 8 | "itwinpreproc", 9 | "typer" 10 | ], 11 | "cSpell.words": [ 12 | "argmax", 13 | "autolog", 14 | "Convolutional", 15 | "cuda", 16 | "dataloaders", 17 | "dataloading", 18 | "fromlist", 19 | "hyperparameters", 20 | "hyperparams", 21 | "imagenet", 22 | "ipython", 23 | "itwinai", 24 | "Lockfiles", 25 | "logfiles", 26 | "logits", 27 | "Mambaforge", 28 | "Micromamba", 29 | "mlflow", 30 | "mnist", 31 | "multiclass", 32 | "mypackage", 33 | "NCCL", 34 | "omegaconf", 35 | "optim", 36 | "plmodels", 37 | "preds", 38 | "preproc", 39 | "pytest", 40 | "pyyaml", 41 | "relu", 42 | "Roadmap", 43 | "savedir", 44 | "SLURM", 45 | "softmax", 46 | "tensorboard", 47 | "torchmetrics", 48 | "torchvision", 49 | "venv", 50 | "wandb" 51 | ], 52 | "markdownlint.run": "onType", 53 | "markdownlint.config": { 54 | "MD013": { 55 | "line_length": 120 56 | } 57 | }, 58 | "[python]": { 59 | "editor.defaultFormatter": "charliermarsh.ruff" 60 | }, 61 | "python.testing.pytestArgs": [ 62 | "tests" 63 | ], 64 | "python.testing.unittestEnabled": false, 65 | "python.testing.pytestEnabled": true, 66 | "python.analysis.extraPaths": [ 67 | "./src/itwinai" 68 | ], 69 | "makefile.configureOnOpen": false, 70 | "files.associations": { 71 | "*.log.*": "log", 72 | "*.err": "log", 73 | "*.out": "log" 74 | } 75 | } -------------------------------------------------------------------------------- /docs/api/itwinai.torch.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.torch 2 | ============= 3 | 4 | 5 | config.py 6 | ++++++++++++++++++ 7 | .. automodule:: itwinai.torch.config 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | distributed.py 15 | ++++++++++++++ 16 | .. 
automodule:: itwinai.torch.distributed
17 |     :members:
18 |     :undoc-members:
19 |     :show-inheritance:
20 |     :member-order: bysource
21 |
22 |
23 | gan.py
24 | ++++++++++++++
25 | .. automodule:: itwinai.torch.gan
26 |     :members:
27 |     :undoc-members:
28 |     :show-inheritance:
29 |     :member-order: bysource
30 |
31 |
32 | inference.py
33 | ++++++++++++
34 | .. automodule:: itwinai.torch.inference
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 |     :member-order: bysource
39 |
40 |
41 | loggers.py
42 | ++++++++++
43 | .. automodule:: itwinai.torch.loggers
44 |     :members:
45 |     :undoc-members:
46 |     :show-inheritance:
47 |     :member-order: bysource
48 |
49 |
50 | mlflow.py
51 | +++++++++
52 | .. automodule:: itwinai.torch.mlflow
53 |     :members:
54 |     :undoc-members:
55 |     :show-inheritance:
56 |     :member-order: bysource
57 |
58 |
59 | reproducibility.py
60 | ++++++++++++++++++
61 | .. automodule:: itwinai.torch.reproducibility
62 |     :members:
63 |     :undoc-members:
64 |     :show-inheritance:
65 |     :member-order: bysource
66 |
67 |
68 | type.py
69 | ++++++++
70 | .. automodule:: itwinai.torch.type
71 |     :members:
72 |     :undoc-members:
73 |     :show-inheritance:
74 |     :member-order: bysource
75 |
76 |
77 | trainer.py
78 | ++++++++++
79 | .. automodule:: itwinai.torch.trainer
80 |     :members:
81 |     :undoc-members:
82 |     :show-inheritance:
83 |     :member-order: bysource
84 |
85 |
86 | tuning.py
87 | ++++++++++
88 | .. automodule:: itwinai.torch.tuning
89 |     :members:
90 |     :undoc-members:
91 |     :show-inheritance:
92 |     :member-order: bysource
-------------------------------------------------------------------------------- /tests/use-cases/test_cyclones.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # - Jarl Sondre Sæther - CERN
9 | # --------------------------------------------------------------------------------------
10 | """Tests for Cyclones use case.
11 |
12 | Intended to be integration tests, to make sure that updates in the code base
13 | do not break use cases' workflows.
14 | """
15 |
16 | import os
17 | import subprocess
18 | from pathlib import Path
19 |
20 | import pytest
21 |
22 | CYCLONES_PATH = Path("use-cases", "cyclones")
23 |
24 |
25 | @pytest.mark.skip("deprecated")
26 | def test_structure_cyclones(check_folder_structure):
27 |     """Test cyclones folder structure."""
28 |     check_folder_structure(CYCLONES_PATH)
29 |
30 |
31 | @pytest.mark.skip("deprecated")
32 | @pytest.mark.functional
33 | @pytest.mark.memory_heavy
34 | def test_cyclones_train_tf(tf_env, install_requirements, tmp_path):
35 |     """
36 |     Test the Cyclones TensorFlow trainer by running it end-to-end.
37 |
38 |     If the CMCCC_DATASET env variable is defined, it is used to
39 |     override the default dataset download location: useful
40 |     when it points to a local copy of the dataset, avoiding
41 |     a new download.
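    Example (the dataset path below is a hypothetical placeholder):

        CMCCC_DATASET=/path/to/local/cmcc pytest tests/use-cases/test_cyclones.py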
42 | """ 43 | # TODO: create a small sample dataset for tests only 44 | install_requirements(CYCLONES_PATH, tf_env) 45 | 46 | dataset_path = os.environ.get("CMCCC_DATASET", "./data/tmp_data") 47 | pipe = CYCLONES_PATH / "pipeline.yaml" 48 | train = CYCLONES_PATH / "train.py" 49 | 50 | cmd = ( 51 | f"{tf_env}/bin/python {train.resolve()} -p {pipe.resolve()} --data_path {dataset_path}" 52 | ) 53 | subprocess.run(cmd.split(), check=True, cwd=tmp_path) 54 | -------------------------------------------------------------------------------- /use-cases/cyclones/pipeline.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | # General configuration 12 | epochs: 3 13 | micro_batch_size: 32 14 | dataset_url: https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD #https://drive.google.com/drive/folders/15DEq33MmtRvIpe2bNCg44lnfvEiHcPaf 15 | dataset_root: tmp_cyclones_data 16 | verbose: auto 17 | global_config: null 18 | 19 | # Workflows 20 | training_pipeline: 21 | class_path: itwinai.pipeline.Pipeline 22 | init_args: 23 | steps: 24 | download-step: 25 | class_path: dataloader.CyclonesDataGetter 26 | init_args: 27 | dataset_url: ${dataset_url} 28 | dataset_root: ${dataset_root} 29 | global_config: ${global_config} 30 | patch_type: NEAREST 31 | shuffle: False 32 | split_ratio: [0.75, 0.25] 33 | augment: True 34 | epochs: ${epochs} 35 | target_scale: False 36 | label_no_cyclone: NONE 37 | aug_type: ONLY_TCS 38 | experiment: { 39 | 'DRV_VARS_1': ['fg10', 'msl', 't_500', 't_300'], 40 | 'COO_VARS_1': ['patch_cyclone'], 41 | 'MSK_VAR_1': None 42 | } 43 | 44 | training-step: 45 | class_path: trainer.CyclonesTrainer 46 | init_args: 47 | epochs: ${epochs} 48 | micro_batch_size: ${micro_batch_size} 49 | global_config: ${global_config} 50 | network: VGG_V1 51 | activation: LINEAR 52 | regularization_strength: NONE 53 | learning_rate: 0.0001 54 | loss: MAE 55 | verbose: ${verbose} -------------------------------------------------------------------------------- /use-cases/lattice-qcd/config.yaml: -------------------------------------------------------------------------------- 1 | # General configuration 2 | batch_size: 128 3 | epochs: 100 4 | optim_lr: 0.001 5 | weight_decay: 0.01 6 | knots_len: 10 7 | symmetric: True 8 | shape: [1] 9 | kappa: 0 10 | m_sq: -1.2 11 | lambd: 0.5 12 | ckpt_disp: False 13 | save_every: None 14 | optimizer_class: torch.optim.AdamW 15 | loss_fn: None 16 | scheduler: None 17 | print_stride: 10 18 | print_batch_size: 1024 19 | snapshot_path: null 20 | epochs_run: 0 21 | strategy: 'ddp' 22 | 23 | training_pipeline: 24 | _target_: itwinai.pipeline.Pipeline 25 | steps: 26 | - _target_: normflow.Fitter 27 | model: 28 | _target_: normflow.Model 29 | net_: 30 | _target_: normflow.nn.DistConvertor_ 31 | knots_len: ${knots_len} 32 | symmetric: ${symmetric} 33 | prior: 34 | _target_: normflow.prior.NormalPrior 35 | shape: ${shape} 36 | action: 37 | _target_: normflow.action.ScalarPhi4Action 38 | kappa: ${kappa} 39 | m_sq: ${m_sq} 40 | lambd: ${lambd} 41 | config: 42 | optim_lr: ${optim_lr} 43 | weight_decay: ${weight_decay} 44 | save_every: ${save_every} 45 | ckpt_disp: 
${ckpt_disp}
46 |       batch_size: ${batch_size}
47 |       optimizer_class: ${optimizer_class}
48 |       scheduler: ${scheduler}
49 |       loss_fn: ${loss_fn}
50 |       print_stride: ${print_stride}
51 |       print_batch_size: ${print_batch_size}
52 |       snapshot_path: ${snapshot_path}
53 |       epochs_run: ${epochs_run}
54 |       epochs: ${epochs}
55 |       strategy: ${strategy}
56 |       measure_epoch_time: False
57 |       measure_gpu_data: False
58 |       enable_torch_profiling: False
59 |       logger:
60 |         _target_: itwinai.loggers.LoggersCollection
61 |         loggers:
62 |           - _target_: itwinai.loggers.ConsoleLogger
63 |             log_freq: 1
64 |           - _target_: itwinai.loggers.MLFlowLogger
65 |             experiment_name: Normalizing flows (ETHZ/CSIC)
66 |             log_freq: batch
67 |
-------------------------------------------------------------------------------- /use-cases/eurac/data.py: --------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple
2 | import xarray as xr
3 | from itwinai.components import DataSplitter, monitor_exec
4 |
5 | from hython.scaler import Scaler
6 | from hython.datasets import get_dataset
7 | from hython.datasets.wflow_sbm import WflowSBM
8 | from hython.config import Config
9 |
10 |
11 | class RNNDatasetGetterAndPreprocessor(DataSplitter):
12 |     def __init__(
13 |         self,
14 |         # == common ==
15 |         hython_trainer: str,
16 |         dataset: str,
17 |         data_lazy_load: bool,
18 |         scaling_variant: str,
19 |         scaling_use_cached: bool,
20 |         experiment_name: str,
21 |         experiment_run: str,
22 |         data_source: dict,
23 |         work_dir: str,
24 |         dynamic_inputs: List[str] | None = None,
25 |         static_inputs: List[str] | None = None,
26 |         target_variables: List[str] | None = None,
27 |         scaling_static_range: Dict | None = None,
28 |         mask_variables: List[str] | None = None,
29 |         static_inputs_mask: List[str] | None = None,
30 |         head_model_inputs: List[str] | None = None,
31 |         train_temporal_range: List[str] | None = None,
32 |         valid_temporal_range: List[str] | None = None,
33 |         train_downsampler: Dict | None = None,
34 |         valid_downsampler: Dict | None = None,
35 |         downsampling_temporal_dynamic: bool | None = None,
36 |         min_sample_target: int | None = None,
37 |         seq_length: int | None = None
38 |     ) -> None:
39 |         self.save_parameters(**self.locals2params(locals()))
40 |
41 |     @monitor_exec
42 |     def execute(self) -> Tuple[WflowSBM, WflowSBM, None]:
43 |         cfg = Config()
44 |
45 |         for i in self.parameters:
46 |             setattr(cfg, i, self.parameters[i])
47 |
48 |         scaler = Scaler(cfg, cfg.scaling_use_cached)
49 |
50 |         train_dataset = get_dataset(cfg.dataset)(cfg, scaler, True, "train")
51 |
52 |         val_dataset = get_dataset(cfg.dataset)(cfg, scaler, False, "valid")
53 |
54 |         return train_dataset, val_dataset, None
55 |
-------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | apiVersion: "kubeflow.org/v1"
11 | kind: PyTorchJob
12 | metadata:
13 |   name: torchrun-cpu
14 | spec:
15 |   # This property assumes that each pod runs on a separate node,
16 |   # and is propagated to torchrun as its --nproc-per-node argument
17 |   nprocPerNode: "2"
18 |   pytorchReplicaSpecs:
19 |     Master:
20
| # Usually only one Master pod is used 21 | replicas: 1 22 | restartPolicy: OnFailure 23 | template: 24 | spec: 25 | containers: 26 | - name: pytorch 27 | image: registry.cern.ch/itwinai/dist-ml/itwinai-slim:0.0.10 28 | command: 29 | - "torchrun" 30 | - "/app/train-cpu.py" 31 | - "--force-dist" 32 | resources: 33 | # Requests help to implicitly make sure that each pod is running 34 | # in a separate node. 35 | requests: 36 | cpu: 1500m 37 | limits: 38 | cpu: 1500m 39 | memory: 2500Mi 40 | Worker: 41 | # The number of worker pods 42 | replicas: 1 43 | restartPolicy: OnFailure 44 | template: 45 | spec: 46 | containers: 47 | - name: pytorch 48 | image: registry.cern.ch/itwinai/dist-ml/itwinai-slim:0.0.10 49 | command: 50 | - "torchrun" 51 | - "/app/train-cpu.py" 52 | - "--force-dist" 53 | resources: 54 | requests: 55 | cpu: 1500m 56 | limits: 57 | cpu: 1500m 58 | memory: 2500Mi 59 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-0-basics/tfmirrored_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=TFTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:15:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=32 17 | #SBATCH --gpus-per-node=4 18 | #SBATCH --exclusive 19 | 20 | # gres options have to be disabled for deepv 21 | #SBATCH --gres=gpu:4 22 | 23 | set -x 24 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 25 | 26 | # set modules 27 | ml --force purge 28 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 29 | 30 | # set env - change to location of your environment 31 | source itwinai/envAItf_hdfml/bin/activate 32 | 33 | # Using legacy (2.16) version of Keras 34 | # Latest version with TF (2.16) installs Keras 3.3 35 | # which returns an error for multi-node execution 36 | export TF_USE_LEGACY_KERAS=1 37 | 38 | # sleep a sec 39 | sleep 1 40 | 41 | # job info 42 | echo "DEBUG: TIME: $(date)" 43 | echo "DEBUG: EXECUTE: $EXEC" 44 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 45 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 46 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 47 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 48 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 49 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 50 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 51 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 52 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 53 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 54 | echo 55 | 56 | # set comm 57 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 58 | export OMP_NUM_THREADS=1 59 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 60 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 61 | fi 62 | 63 | COMMAND="train.py" 64 | 65 | EXEC="$COMMAND " 66 | 67 | srun python -u $EXEC 68 | -------------------------------------------------------------------------------- /use-cases/3dgan/slurm.vega.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SLURM jobscript for Vega systems 4 | 5 | # Job configuration 6 | #SBATCH --job-name=3dgan_training 7 | #SBATCH 
--account=s24r05-03-users 8 | #SBATCH --mail-user= 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --output=job.out 11 | #SBATCH --error=job.err 12 | #SBATCH --time=01:00:00 13 | 14 | # Resources allocation 15 | #SBATCH --partition=gpu 16 | #SBATCH --nodes=2 17 | #SBATCH --gpus-per-node=4 18 | #SBATCH --cpus-per-gpu=4 19 | #SBATCH --ntasks-per-node=1 20 | # SBATCH --mem-per-gpu=10G 21 | #SBATCH --exclusive 22 | 23 | # gres options have to be disabled for deepv 24 | #SBATCH --gres=gpu:4 25 | 26 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 27 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 28 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 29 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 30 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 31 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 32 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 33 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 34 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 35 | 36 | # ml --force purge 37 | # ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/11.7 38 | # ml GCCcore/11.3.0 NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0 cuDNN 39 | 40 | ml Python 41 | module unload OpenSSL 42 | 43 | source ~/.bashrc 44 | 45 | # Activate the environment 46 | source ../../.venv-pytorch/bin/activate 47 | 48 | GAN_DATASET="exp_data" #"/ceph/hpc/data/st2301-itwin-users/egarciagarcia" 49 | 50 | # launch training 51 | TRAINING_CMD="$(which itwinai) exec-pipeline num_nodes=$SLURM_NNODES \ 52 | dataset_location=$GAN_DATASET " 53 | 54 | srun --cpu-bind=none --ntasks-per-node=1 \ 55 | bash -c "torchrun \ 56 | --log_dir='logs_torchrun' \ 57 | --nnodes=$SLURM_NNODES \ 58 | --nproc_per_node=$SLURM_GPUS_PER_NODE \ 59 | --rdzv_id=$SLURM_JOB_ID \ 60 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 61 | --rdzv_backend=c10d \ 62 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)':29500 \ 63 | $TRAINING_CMD " -------------------------------------------------------------------------------- /tests/components/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import pytest 11 | 12 | pytest.PIPE_LIST_YAML = """ 13 | my-list-pipeline: 14 | _target_: itwinai.pipeline.Pipeline 15 | steps: 16 | - _target_: itwinai.tests.dummy_components.FakePreproc 17 | max_items: 33 18 | name: my-preproc 19 | 20 | - _target_: itwinai.tests.dummy_components.FakeTrainer 21 | lr: 0.001 22 | batch_size: 32 23 | name: my-trainer 24 | """ 25 | 26 | pytest.PIPE_DICT_YAML = """ 27 | my-dict-pipeline: 28 | _target_: itwinai.pipeline.Pipeline 29 | steps: 30 | preproc-step: 31 | _target_: itwinai.tests.dummy_components.FakePreproc 32 | max_items: 33 33 | name: my-preproc 34 | 35 | train-step: 36 | _target_: itwinai.tests.dummy_components.FakeTrainer 37 | lr: 0.001 38 | batch_size: 32 39 | name: my-trainer 40 | """ 41 | 42 | pytest.NESTED_PIPELINE = """ 43 | some: 44 | field: 45 | my-nested-pipeline: 46 | _target_: itwinai.pipeline.Pipeline 47 | steps: 48 | - _target_: itwinai.tests.dummy_components.FakePreproc 49 | max_items: 33 50 | name: my-preproc 51 | 52 | - _target_: itwinai.tests.dummy_components.FakeTrainer 53 | lr: 0.001 
54 |       batch_size: 32
55 |       name: my-trainer
56 | """
57 |
58 | pytest.INTERPOLATED_VALUES_PIPELINE = """
59 | max_items: 33
60 | name: my-trainer
61 | lr: 0.001
62 | my-interpolation-pipeline:
63 |   _target_: itwinai.pipeline.Pipeline
64 |   steps:
65 |     - _target_: itwinai.tests.dummy_components.FakePreproc
66 |       max_items: ${max_items}
67 |       name: my-preproc
68 |
69 |     - _target_: itwinai.tests.dummy_components.FakeTrainer
70 |       lr: ${lr}
71 |       batch_size: 32
72 |       name: ${name}
73 | """
74 |
-------------------------------------------------------------------------------- /docs/installation/software_prerequisites.rst: --------------------------------------------------------------------------------
1 | Setting up the system dependencies
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | First of all, before installing itwinai and its Python dependencies, let's make sure that the
4 | system dependencies, such as CUDA drivers, compilers, and MPI libraries, are correctly set up.
5 |
6 | Supported OSs are Linux and macOS.
7 |
8 | .. warning::
9 |
10 |     On high-performance computing (HPC) systems, **you must load the appropriate modules
11 |     before creating or activating your Python virtual environment** to ensure compatibility with
12 |     system libraries.
13 |
14 | .. include:: ./hpc_modules.rst
15 |
16 |
17 | Creating a Python Virtual Environment
18 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 | The suggested way of managing Python dependencies, including itwinai, is through Python virtual
20 | environments. Creating a virtual environment allows you to isolate dependencies and prevent
21 | conflicts with other Python projects.
22 |
23 | Beware that some HPC centers advise against using Python virtual environments, as they create a
24 | large number of files, which can clog some distributed filesystems. In such situations, you
25 | should prefer using containers.
26 |
27 | To manage Python virtual environments we use UV, which can be installed from
28 | `this page `_. Learn more about the UV
29 | package manager in our `UV tutorial `_.
30 |
31 | If you don't already have a virtual environment, you can create one with the following
32 | command:
33 |
34 | .. code-block:: bash
35 |
36 |     # Remember to load the software modules first (see section above)!
37 |
38 |     uv venv
39 |
40 |     # Alternatively to the command above, if you just want to use plain pip instead of UV
41 |     python -m venv .venv
42 |
43 | Notice that a new directory called ``.venv`` is created to contain your virtual
44 | environment. Now, you can activate your virtual environment with the following command:
45 |
46 | .. code-block:: bash
47 |
48 |     # Remember to load the software modules first (see section above)!
49 |
50 |     source .venv/bin/activate
51 |
-------------------------------------------------------------------------------- /.github/workflows/pytest.yml: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | name: Testing with pytest
11 |
12 | on:
13 |   pull_request:
14 |     branches: [main]
15 |
16 | jobs:
17 |   test-torch:
18 |     name: Testing with pytest
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |
22 |       # Uncomment this only if you run out of disk space!
23 | # - name: Maximize Disk Space 24 | # uses: easimon/maximize-build-space@v10 25 | # with: 26 | # # Reserve space on root for docker/dagger cache 27 | # build-mount-path: /docker 28 | # root-reserve-mb: 2048 29 | # overprovision-lvm: false 30 | # swap-size-mb: 4096 31 | # remove-dotnet: true 32 | # remove-android: true 33 | # remove-haskell: true 34 | # remove-codeql: true 35 | 36 | - uses: actions/checkout@v6 37 | 38 | # ALSO uncomment this only if you run out of disk space! 39 | # - name: Move Docker directory 40 | # shell: bash 41 | # run: | 42 | # sudo mv /var/lib/docker /docker/ && 43 | # sudo ln -s /docker/docker /var/lib/docker && 44 | # sudo systemctl restart docker 45 | 46 | # Run tests with pytest in a container 47 | - name: Run Integration Test (development pipeline) 48 | uses: dagger/dagger-for-github@v7 49 | with: 50 | workdir: ci 51 | verb: call 52 | args: >- 53 | build-container 54 | --context .. 55 | --dockerfile ../env-files/torch/skinny.Dockerfile 56 | test-local 57 | --cmd "pytest,-v,--disable-warnings,-n,logical,/app/tests/,--dist,loadfile,-m,not hpc and not tensorflow" 58 | logs 59 | cloud-token: ${{ secrets.DAGGER_CLOUD_TOKEN }} 60 | version: latest 61 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/TODO 2 | **/mamba* 3 | pl-training.yml 4 | .vscode 5 | 6 | # Project folders/files 7 | # use-cases 8 | workflows 9 | CHANGELOG 10 | 11 | # Docs 12 | docs 13 | 14 | # interLink pods 15 | **/interLink 16 | **/interlink 17 | 18 | # Data 19 | **/MNIST 20 | **/*-predictions/ 21 | **/*-data/ 22 | **/*.tar.gz 23 | **/exp_data 24 | 25 | # Logs 26 | **/logs 27 | **/lightning_logs 28 | **/mlruns 29 | **/.logs 30 | **/mllogs 31 | **/nohup* 32 | **/*.out 33 | **/*.err 34 | **/checkpoints/ 35 | **/*_logs 36 | **/tmp* 37 | **/.tmp* 38 | 39 | # Markdown 40 | **/*.md 41 | 42 | # Custom envs 43 | **/.venv* 44 | 45 | # Git 46 | .git 47 | .gitignore 48 | .github 49 | 50 | # CI 51 | .codeclimate.yml 52 | .travis.yml 53 | .taskcluster.yml 54 | 55 | # Docker 56 | docker-compose.yml 57 | .docker 58 | .dockerignore 59 | Dockerfile 60 | 61 | # Byte-compiled / optimized / DLL files 62 | **/__pycache__/ 63 | **/*.py[cod] 64 | 65 | # C extensions 66 | *.so 67 | 68 | # Distribution / packaging 69 | .Python 70 | env/ 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | downloads/ 75 | **/eggs/ 76 | lib/ 77 | lib64/ 78 | parts/ 79 | sdist/ 80 | var/ 81 | **/*.egg-info/ 82 | **/.installed.cfg 83 | **/*.egg 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .coverage 99 | .cache 100 | nosetests.xml 101 | coverage.xml 102 | 103 | # Translations 104 | *.mo 105 | *.pot 106 | 107 | # Django stuff: 108 | *.log 109 | 110 | # Sphinx documentation 111 | docs/_build/ 112 | 113 | # PyBuilder 114 | target/ 115 | 116 | # Virtual environment 117 | .env/ 118 | .venv/ 119 | venv/ 120 | 121 | # PyCharm 122 | .idea 123 | 124 | # Python mode for VIM 125 | .ropeproject 126 | */.ropeproject 127 | */*/.ropeproject 128 | */*/*/.ropeproject 129 | 130 | # Vim swap files 131 | *.swp 132 | */*.swp 133 | */*/*.swp 134 | */*/*/*.swp -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-scaling-test-jube/jube_ddp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=JUBE_DDP 5 | #SBATCH --account=#ACC# 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=#TIMELIM# 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=#QUEUE# 14 | #SBATCH --nodes=#NODES# 15 | #SBATCH --cpus-per-task=#NW# 16 | #SBATCH --gpus-per-node=#NGPU# 17 | #SBATCH --exclusive 18 | 19 | # gres options have to be disabled for deepv 20 | #SBATCH --gres=gpu:4 21 | 22 | set -x 23 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 24 | 25 | # set modules 26 | ml --force purge 27 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 28 | 29 | # set env 30 | source /p/project/intertwin/rakesh/repo_push/itwinai/envAItf_hdfml/bin/activate 31 | 32 | # Using legacy (2.16) version of Keras 33 | # Latest version with TF (2.16) installs Keras 3.3 34 | # which returns an error for multi-node execution 35 | export TF_USE_LEGACY_KERAS=1 36 | 37 | # sleep a sec 38 | sleep 1 39 | 40 | # job info 41 | echo "DEBUG: TIME: $(date)" 42 | echo "DEBUG: EXECUTE: $EXEC" 43 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 44 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 45 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 46 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 47 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 48 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 49 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 50 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 51 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 52 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 53 | echo 54 | 55 | # set comm 56 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 57 | export OMP_NUM_THREADS=1 58 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 59 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 60 | fi 61 | 62 | dataDir='/p/scratch/intertwin/datasets/imagenet/' 63 | 64 | COMMAND="train.py" 65 | 66 | EXEC="$COMMAND \ 67 | --data_dir $dataDir" 68 | 69 | srun python -u $EXEC 70 | 71 | 72 | #eof 73 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-1-imagenet/tfmirrored_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=TFTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH 
--output=job.out
9 | #SBATCH --error=job.err
10 | #SBATCH --time=01:00:00
11 |
12 | # configure node and process count on the CM
13 | #SBATCH --partition=batch
14 | #SBATCH --nodes=4
15 | #SBATCH --ntasks-per-node=1
16 | #SBATCH --cpus-per-task=32
17 | #SBATCH --gpus-per-node=4
18 | #SBATCH --exclusive
19 |
20 | # gres options have to be disabled for deepv
21 | #SBATCH --gres=gpu:4
22 |
23 | set -x
24 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
25 |
26 | # set modules
27 | ml --force purge
28 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
29 |
30 | # set env
31 | source /p/project/intertwin/rakesh/repo_push/itwinai/envAItf_hdfml/bin/activate
32 |
33 | # Using legacy (2.16) version of Keras
34 | # Latest version with TF (2.16) installs Keras 3.3
35 | # which returns an error for multi-node execution
36 | export TF_USE_LEGACY_KERAS=1
37 |
38 | # sleep a sec
39 | sleep 1
40 |
41 | # job info
42 | echo "DEBUG: TIME: $(date)"
43 | echo "DEBUG: EXECUTE: $EXEC"
44 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR"
45 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
46 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
47 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
48 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
49 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
50 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
51 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
52 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
53 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST"
54 | echo
55 |
56 | # set comm
57 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
58 | export OMP_NUM_THREADS=1
59 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then
60 |   export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
61 | fi
62 |
63 | dataDir='/p/scratch/intertwin/datasets/imagenet/'
64 |
65 | COMMAND="train.py"
66 |
67 | EXEC="$COMMAND \
68 |     --data_dir $dataDir"
69 |
70 | srun python -u $EXEC
71 |
-------------------------------------------------------------------------------- /docs/tutorials/tutorials.rst: --------------------------------------------------------------------------------
1 | .. _tutorials:
2 |
3 |
4 |
5 | .. _distributed-training-tutorials:
6 |
7 | Distributed machine learning training
8 | ======================================
9 |
10 | Here you can find a collection of tutorials for distributing PyTorch- and TensorFlow-based workflows.
11 |
12 |
13 | Distributed ML with PyTorch
14 | ---------------------------
15 |
16 | .. toctree::
17 |     :maxdepth: 1
18 |     :numbered:
19 |
20 |     distrib-ml/torch_tutorial_0_basics
21 |     distrib-ml/torch_tutorial_1_mnist
22 |     distrib-ml/torch_tutorial_2_trainer_class
23 |     distrib-ml/torch-tutorial-GAN
24 |     distrib-ml/torch_scaling_test
25 |     distrib-ml/torch-tutorial-containers
26 |     distrib-ml/torch_tutorial_kubeflow_1.rst
27 |     distrib-ml/kuberay-setup-tutorial.rst
28 |
29 |
30 | Distributed ML with TensorFlow
31 | ------------------------------
32 |
33 | .. toctree::
34 |     :maxdepth: 1
35 |     :numbered:
36 |
37 |     distrib-ml/tf_tutorial_0_basics
38 |     distrib-ml/tf_tutorial_1_imagenet
39 |     distrib-ml/tf_scaling_test
40 |
41 |
42 | .. _ml-workflows-tutorials:
43 |
44 | Machine Learning Workflows
45 | ===========================
46 |
47 | Here you can find a collection of tutorials for ML workflows of varying complexity.
48 |
49 | ..
toctree:: 50 | :maxdepth: 1 51 | 52 | workflows/01-pipeline-introduction/tutorial_0_basic_workflow 53 | workflows/02-pipeline-configuration/tutorial_1_intermediate_workflow 54 | workflows/03-dag-workflows/tutorial_2_advanced_workflow 55 | workflows/04_itwinai_argparser 56 | 57 | 58 | .. _hpo-tutorials: 59 | 60 | Hyperparameter Optimization 61 | =========================== 62 | 63 | This tutorial provides an overview of Hyperparameter Optimization (HPO) workflows. 64 | 65 | .. toctree:: 66 | :maxdepth: 1 67 | 68 | hpo-workflows/hpo-torchtrainer-integration 69 | 70 | 71 | .. _profiling-tutorials: 72 | 73 | Code Profiling and Optimization 74 | =============================== 75 | 76 | Here you can find our tutorials on how to do profiling with **itwinai**: 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | 81 | profiling/profiling-overview 82 | profiling/py-spy-profiling 83 | profiling/py-spy-lattice-qcd-example 84 | -------------------------------------------------------------------------------- /use-cases/cyclones/startscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=cyclones 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:30:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=4 17 | #SBATCH --gpus-per-node=4 18 | 19 | # SBATCH --exclusive 20 | 21 | # gres options have to be disabled for deepv 22 | #SBATCH --gres=gpu:4 23 | 24 | set -x 25 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 26 | 27 | # load modules 28 | ml --force purge 29 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 30 | 31 | source ../../envAItf_hdfml/bin/activate 32 | 33 | # job info 34 | echo "DEBUG: TIME: $(date)" 35 | echo "DEBUG: EXECUTE: $EXEC" 36 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 37 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 38 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 39 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 40 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 41 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 42 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 43 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 44 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 45 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 46 | echo 47 | 48 | # ONLY IF TENSORFLOW >= 2.16: 49 | # # Using legacy (2.16) version of Keras 50 | # # Latest version with TF (2.16) installs Keras 3.3 51 | # # which returns an error for multi-node execution 52 | # export TF_USE_LEGACY_KERAS=1 53 | 54 | # set comm 55 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 56 | export OMP_NUM_THREADS=1 57 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 58 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 59 | fi 60 | 61 | # ON LOGIN NODE download datasets: 62 | # ../../.venv-tf/bin/python train.py -p pipeline.yaml --download-only 63 | 64 | # --data_path argument is optional, but on JSC we use the dataset we previously downloaded 65 | srun python train.py -p pipeline.yaml --data_path /p/project/intertwin/smalldata/cmcc -------------------------------------------------------------------------------- /tutorials/hpo-workflows/fashion-mnist/config.yaml: 
--------------------------------------------------------------------------------
1 | hpo_training_pipeline:
2 |   _target_: itwinai.pipeline.Pipeline
3 |   steps:
4 |     - _target_: data.FashionMNISTGetter
5 |     - _target_: data.FashionMNISTSplitter
6 |       train_proportion: 0.9
7 |       validation_proportion: 0.1
8 |       test_proportion: 0.0
9 |     - _target_: trainer.FashionMNISTTrainer
10 |
11 |       # In this case we have nothing to pass to the TrainingConfiguration. Some of its fields
12 |       # will be overridden using the hyperparameters sampled from the search space by the tuner.
13 |       config: null
14 |
15 |       epochs: 2
16 |
17 |       # For more info: https://docs.ray.io/en/latest/train/api/doc/ray.train.ScalingConfig.html
18 |       ray_scaling_config:
19 |         _target_: ray.train.ScalingConfig
20 |         num_workers: 1
21 |         use_gpu: true
22 |         resources_per_worker:
23 |           CPU: 8
24 |           GPU: 1
25 |
26 |       # For more info: https://docs.ray.io/en/latest/tune/api/doc/ray.tune.TuneConfig.html
27 |       ray_tune_config:
28 |         _target_: ray.tune.TuneConfig
29 |         num_samples: 2
30 |         scheduler:
31 |           _target_: ray.tune.schedulers.ASHAScheduler
32 |           metric: loss # name of the metric to optimize during HPO
33 |           mode: min
34 |           max_t: 10
35 |           grace_period: 5
36 |           reduction_factor: 4
37 |           brackets: 1
38 |
39 |       # For more info: https://docs.ray.io/en/latest/tune/api/doc/ray.tune.RunConfig.html
40 |       ray_run_config:
41 |         _target_: ray.tune.RunConfig
42 |         storage_path: ${itwinai.cwd:}/ray_checkpoints
43 |         name: FashionMNIST-HPO-Experiment
44 |
45 |       # For more info: https://docs.ray.io/en/latest/tune/api/search_space.html
46 |       ray_search_space:
47 |         batch_size:
48 |           type: choice
49 |           categories: [32, 64, 128]
50 |         learning_rate:
51 |           type: uniform
52 |           lower: 1e-5
53 |           upper: 1e-3
54 |
55 |       strategy: ddp
56 |       logger:
57 |         _target_: itwinai.loggers.LoggersCollection
58 |         loggers:
59 |           - _target_: itwinai.loggers.MLFlowLogger
60 |             experiment_name: FashionMNIST HPO Experiment
61 |             log_freq: batch
62 |
-------------------------------------------------------------------------------- /src/itwinai/slurm/slurm_script_configuration.py: --------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from pydantic import BaseModel
4 |
5 | from itwinai.slurm.slurm_constants import SLURM_TEMPLATE
6 |
7 |
8 | class SlurmScriptConfiguration(BaseModel):
9 |     """Configuration object for the SLURM script. It contains all the settings for the
10 |     SLURM script, such as which hardware you are requesting or for how long to run it.
11 |     As it allows for any ``pre_exec_command`` and ``exec_command``, it should work for
12 |     any SLURM script.
13 |     """
14 |
15 |     # Settings for the SLURM configuration
16 |     job_name: str | None = None
17 |     account: str
18 |     partition: str
19 |     time: str
20 |
21 |     std_out: Path | None = None
22 |     err_out: Path | None = None
23 |
24 |     num_nodes: int
25 |     num_tasks_per_node: int
26 |     gpus_per_node: int
27 |     cpus_per_task: int
28 |     memory: str
29 |     exclusive: bool = False
30 |
31 |     # Typically used to set up the environment before executing the command,
32 |     # e.g. "ml Python", "source .venv/bin/activate" etc.
33 |     pre_exec_command: str | None = None
34 |
35 |     # Command to execute, typically an 'srun' command
36 |     exec_command: str | None = None
37 |
38 |     def exclusive_line(self) -> str:
39 |         return "#SBATCH --exclusive" if self.exclusive else ""
40 |
41 |     def generate_script(self) -> str:
42 |         """Uses the provided configuration parameters and formats a SLURM script with
43 |         the requested settings.
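        A minimal usage sketch (all values below are illustrative placeholders):

        >>> cfg = SlurmScriptConfiguration(
        ...     job_name="demo", account="my_account", partition="gpu",
        ...     time="01:00:00", std_out=Path("job.out"), err_out=Path("job.err"),
        ...     num_nodes=1, num_tasks_per_node=1, gpus_per_node=4, cpus_per_task=8,
        ...     memory="64G", pre_exec_command="ml Python",
        ...     exec_command="srun python train.py",
        ... )
        >>> script = cfg.generate_script()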
44 |
45 |         Returns:
46 |             str: A string containing the SLURM script.
47 |         """
48 |         if (
49 |             self.std_out is None
50 |             or self.err_out is None
51 |             or self.job_name is None
52 |             or self.pre_exec_command is None
53 |             or self.exec_command is None
54 |         ):
55 |             raise ValueError(
56 |                 "SlurmScriptConfiguration has some fields set to None! Make sure to set all"
57 |                 " fields before generating script! Configuration was formatted as follows:\n"
58 |                 f"{repr(self)}"
59 |             )
60 |
61 |         return SLURM_TEMPLATE.format_map(
62 |             self.model_dump() | {"exclusive_line": self.exclusive_line()}
63 |         )
64 |
-------------------------------------------------------------------------------- /docs/installation/post_itwinai_installation.rst: --------------------------------------------------------------------------------
1 | .. note::
2 |     If you want to use the Prov4ML logger, you need to install it explicitly since it is only
3 |     available on GitHub:
4 |
5 |     For systems with Nvidia GPUs:
6 |
7 |     .. code-block:: bash
8 |
9 |         uv pip install "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2"
10 |
11 |     For macOS:
12 |
13 |     .. code-block:: bash
14 |
15 |         uv pip install "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2"
16 |
17 |
18 | Installing Horovod and Microsoft DeepSpeed
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | If you also want to install Horovod and Microsoft DeepSpeed for distributed ML with
21 | PyTorch, then make sure to install them **after** ``itwinai``. You can choose whether you
22 | want to do this with or without GPU (CUDA) support:
23 |
24 | .. tab-set::
25 |
26 |     .. tab-item:: CPU
27 |
28 |         .. code-block:: bash
29 |
30 |             uv pip install --no-cache-dir --no-build-isolation git+https://github.com/horovod/horovod.git@3a31d93
31 |             uv pip install --no-cache-dir --no-build-isolation deepspeed==0.16.8
32 |
33 |
34 |     .. tab-item:: CUDA
35 |
36 |         .. code-block:: bash
37 |
38 |             curl -fsSL https://github.com/interTwin-eu/itwinai/raw/main/env-files/torch/install-horovod-deepspeed-cuda.sh | bash
39 |
40 |
41 | .. warning::
42 |
43 |     Horovod requires ``CMake>=3.13`` and
44 |     `other packages `_.
45 |     Make sure to have them installed in your environment before proceeding.
46 |
47 |
48 | .. warning::
49 |     The installation of Horovod and DeepSpeed needs to be executed on a machine/node where GPUs
50 |     are available. On some HPC systems, such as the `JUWELS `_
51 |     system on JSC, GPUs **are not available on login nodes** (the host you connect to when you
52 |     SSH into the system), only on **compute nodes**. On the JUWELS system, run this command to
53 |     install DeepSpeed and Horovod directly **from the repository's root**:
54 |
55 |     .. code-block:: bash
56 |
57 |         curl -fsSL https://github.com/interTwin-eu/itwinai/raw/main/env-files/torch/horovod-deepspeed-JSC.slurm | sbatch
58 |
59 |
-------------------------------------------------------------------------------- /env-files/tensorflow/createEnvJSCTF.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # -*- coding: utf-8 -*-
3 |
4 | if [ ! -f "env-files/tensorflow/generic_tf.sh" ]; then
5 |   echo "ERROR: env-files/tensorflow/generic_tf.sh not found!"
6 |   exit 1
7 | fi
8 |
9 | # set modules
10 | ml --force purge
11 |
12 | # get sys info
13 | cDir=$PWD
14 | sysN="$(uname -n | cut -f2- -d.)"
15 | echo "system:${sysN}"
16 | echo
17 |
18 | cont1=false
19 | if [ "$sysN" = 'hdfml' ] ; then
20 |   # NOTE: REFLECT THEM IN THE MAIN README!
21 |   ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
22 |   cont1=true
23 | else
24 |   echo
25 |   echo 'unknown system detected'
26 |   echo 'canceling'
27 |   echo
28 | fi
29 | echo "modules loaded"
30 | echo
31 |
32 | # get python version
33 | pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)"
34 | echo "python version is ${pver}"
35 | echo
36 |
37 | if [ "$cont1" = true ] ; then
38 |   if [ -d "${cDir}/envAItf_${sysN}" ];then
39 |     echo 'env already exists'
40 |     echo
41 |
42 |     source envAItf_${sysN}/bin/activate
43 |   else
44 |     # create env
45 |     python -m venv envAItf_${sysN}
46 |
47 |     # get headers for pip
48 |     if [ -f "${cDir}/envAItf_${sysN}/bin/pip" ]; then
49 |       echo 'pip already exists'
50 |     else
51 |       cp "$(which pip)" $cDir/envAItf_${sysN}/bin/
52 |       ln -s $cDir/envAItf_${sysN}/bin/pip $cDir/envAItf_${sysN}/bin/pip${pver}
53 |       var="#!$cDir/envAItf_${sysN}/bin/python${pver}"
54 |       sed -i "1s|.*|$var|" $cDir/envAItf_${sysN}/bin/pip
55 |     fi
56 |
57 |     # activate env
58 |     source envAItf_${sysN}/bin/activate
59 |
60 |     echo "a new env is created in ${cDir}"
61 |     echo "activation is done via:"
62 |     echo "source ${cDir}/envAItf_${sysN}/bin/activate"
63 |   fi
64 | fi
65 |
66 | # Install TF dependencies in env
67 | export ENV_NAME="envAItf_$sysN"
68 | bash env-files/tensorflow/generic_tf.sh
69 | source $ENV_NAME/bin/activate
70 |
71 | # JUBE benchmarking environment
72 | if [ -f "${cDir}/envAItf_${sysN}/bin/jube" ]; then
73 |   echo 'JUBE already installed'
74 | else
75 |   pip install --no-cache-dir http://apps.fz-juelich.de/jsc/jube/jube2/download.php?version=latest
76 | fi
77 |
78 | # # get rest of the libraries
79 | # if [ "$cont1" = true ] ; then
80 | #   pip install -r reqs_TF.txt #--ignore-installed
81 | # fi
82 |
83 |
-------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-2-trainer-class/sample_code.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Jarl Sondre Sæther
5 | #
6 | # Credit:
7 | # - Jarl Sondre Sæther - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """This file contains the sample code that was used for the snippets in the interTwin
11 | presentation held on Feb. 18. These code snippets are meant as outlines for how to use
12 | itwinai to simplify distributed ML.
13 | """
14 |
15 | from itwinai.torch.distributed import TorchDDPStrategy
16 | from itwinai.torch.trainer import TorchTrainer
17 |
18 |
19 | # Included for the sake of linting
20 | def train(model):
21 |     pass
22 |
23 |
24 | ##############################################################################
25 | # Using itwinai's Strategy but not the TorchTrainer
26 | ##############################################################################
27 |
28 | # Create and initialize strategy
29 | strategy = TorchDDPStrategy(backend="nccl")
30 | strategy.init()
31 |
32 | # Create dataset as usual
33 | train_dataset = ...
34 |
35 | # Use 'strategy' to create dataloader
36 | train_dataloader = strategy.create_dataloader(train_dataset, ...)
37 |
38 | # Create model, optimizer and scheduler as usual
39 | model, optimizer, scheduler = ...
40 | 41 | # Distribute them using 'strategy' 42 | model, optimizer, scheduler = strategy.distributed(model, optimizer, scheduler) 43 | 44 | # Train model as usual 45 | train(model) # Note: have to notify 'strategy' every time an epoch passes 46 | 47 | # Clean up strategy at the end 48 | strategy.clean_up() 49 | ############################################################################## 50 | 51 | 52 | ############################################################################## 53 | # Using itwinai's TorchTrainer (which uses Strategy internally) 54 | ############################################################################## 55 | 56 | # Create dataset as usual 57 | train_dataset = ... 58 | 59 | # Create model as usual 60 | model = ... 61 | 62 | trainer = TorchTrainer(config={}, model=model, strategy="ddp") 63 | 64 | _, _, _, trained_model = trainer.execute(train_dataset, ...) 65 | ############################################################################## 66 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/saver.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """This module is used during inference to save predicted labels to file.""" 11 | 12 | import csv 13 | import os 14 | import shutil 15 | from typing import Dict, List, Optional 16 | 17 | from itwinai.components import Saver, monitor_exec 18 | 19 | 20 | class TorchMNISTLabelSaver(Saver): 21 | """Serializes to disk the labels predicted for MNIST dataset.""" 22 | 23 | def __init__( 24 | self, 25 | save_dir: str = "mnist_predictions", 26 | predictions_file: str = "predictions.csv", 27 | class_labels: Optional[List] = None, 28 | ) -> None: 29 | super().__init__() 30 | self.save_parameters(**self.locals2params(locals())) 31 | self.save_dir = save_dir 32 | self.predictions_file = predictions_file 33 | self.class_labels = ( 34 | class_labels if class_labels is not None else [f"Digit {i}" for i in range(10)] 35 | ) 36 | 37 | @monitor_exec 38 | def execute( 39 | self, 40 | predicted_classes: Dict[str, int], 41 | ) -> Dict[str, int]: 42 | """Translate predictions from class idx to class label and save 43 | them to disk. 44 | 45 | Args: 46 | predicted_classes (Dict[str, int]): maps unique item ID to 47 | the predicted class ID. 48 | 49 | Returns: 50 | Dict[str, int]: predicted classes. 
51 | """ 52 | if os.path.exists(self.save_dir): 53 | shutil.rmtree(self.save_dir) 54 | os.makedirs(self.save_dir) 55 | 56 | # Map class idx (int) to class label (str) 57 | predicted_labels = { 58 | itm_name: self.class_labels[cls_idx] 59 | for itm_name, cls_idx in predicted_classes.items() 60 | } 61 | 62 | # Save to disk 63 | filepath = os.path.join(self.save_dir, self.predictions_file) 64 | with open(filepath, "w") as csv_file: 65 | writer = csv.writer(csv_file) 66 | for key, value in predicted_labels.items(): 67 | writer.writerow([key, value]) 68 | return predicted_labels 69 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/config.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # General config 11 | dataset_root: .tmp/ 12 | num_classes: 10 13 | batch_size: 64 14 | num_workers_dataloader: 4 15 | pin_memory: False 16 | lr: 0.001 17 | momentum: 0.9 18 | fp16_allreduce: False 19 | use_adasum: False 20 | gradient_predivide_factor: 1.0 21 | epochs: 2 22 | strategy: ddp 23 | test_data_path: mnist-sample-data 24 | inference_model_mlflow_uri: mnist-pre-trained.pth 25 | predictions_dir: mnist-predictions 26 | predictions_file: predictions.csv 27 | class_labels: null 28 | 29 | # Workflows configuration 30 | training_pipeline: 31 | _target_: itwinai.pipeline.Pipeline 32 | steps: 33 | dataloading_step: 34 | _target_: dataloader.MNISTDataModuleTorch 35 | save_path: ${dataset_root} 36 | 37 | training_step: 38 | _target_: itwinai.torch.trainer.TorchTrainer 39 | config: 40 | batch_size: ${batch_size} 41 | num_workers: ${num_workers_dataloader} 42 | pin_memory: ${pin_memory} 43 | lr: ${lr} 44 | momentum: ${momentum} 45 | fp16_allreduce: ${fp16_allreduce} 46 | use_adasum: ${use_adasum} 47 | gradient_predivide_factor: ${gradient_predivide_factor} 48 | 49 | model: 50 | _target_: model.Net 51 | epochs: ${epochs} 52 | metrics: 53 | accuracy: 54 | _target_: torchmetrics.classification.MulticlassAccuracy 55 | num_classes: ${num_classes} 56 | precision: 57 | _target_: torchmetrics.classification.MulticlassPrecision 58 | num_classes: ${num_classes} 59 | recall: 60 | _target_: torchmetrics.classification.MulticlassRecall 61 | num_classes: ${num_classes} 62 | logger: 63 | _target_: itwinai.loggers.LoggersCollection 64 | loggers: 65 | - _target_: itwinai.loggers.ConsoleLogger 66 | log_freq: 10000 67 | - _target_: itwinai.loggers.MLFlowLogger 68 | experiment_name: MNIST classifier 69 | log_freq: batch 70 | strategy: ${strategy} 71 | # checkpoint_every: 1 72 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: itwinai 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software
10 |
11 | # The order reflects the contributions to the repository to date
12 | # Also including supervisors who attended the meetings
13 | authors:
14 |   - given-names: Matteo
15 |     family-names: Bunino
16 |     email: matteo.bunino@cern.ch
17 |     affiliation: CERN
18 |     orcid: 'https://orcid.org/0009-0008-5100-9300'
19 |   - given-names: Rakesh
20 |     family-names: Sarma
21 |     email: r.sarma@fz-juelich.de
22 |     affiliation: FZ Jülich
23 |   - given-names: Jarl Sondre
24 |     family-names: Sæther
25 |     email: jarl.sondre.saether@cern.ch
26 |     affiliation: CERN
27 |   - given-names: Anna Elisa
28 |     family-names: Lappe
29 |     email: anna.elisa.lappe@cern.ch
30 |     affiliation: CERN
31 |   - given-names: Kalliopi
32 |     family-names: Tsolaki
33 |     email: kalliopi.tsolaki@cern.ch
34 |     affiliation: CERN
35 |   - given-names: Killian
36 |     family-names: Verder
37 |     email: killian.verder@cern.ch
38 |     affiliation: CERN
39 |   - given-names: Henry
40 |     family-names: Mutegeki
41 |     email: henry.mutegeki@cern.ch
42 |     affiliation: CERN
43 |   - given-names: Roman
44 |     family-names: Machacek
45 |     email: roman.machacek@cern.ch
46 |     affiliation: CERN
47 |   - given-names: Alexander
48 |     family-names: Zoechbauer
49 |     email: alexander.zoechbauer@cern.ch
50 |     affiliation: CERN
51 |   - given-names: Mario
52 |     family-names: Ruettgers
53 |     email: m.ruettgers@fz-juelich.de
54 |     affiliation: FZ Jülich
55 |   - given-names: Ilaria
56 |     family-names: Luise
57 |     email: ilaria.luise@cern.ch
58 |     affiliation: CERN
59 |   - given-names: Eric
60 |     family-names: Wulff
61 |     email: eric.wulff@cern.ch
62 |     affiliation: CERN
63 |   - given-names: Maria
64 |     family-names: Girone
65 |     email: maria.girone@cern.ch
66 |     affiliation: CERN
67 |   - given-names: Andreas
68 |     family-names: Lintermann
69 |     email: a.lintermann@fz-juelich.de
70 |     affiliation: FZ Jülich
71 | repository-code: 'https://github.com/interTwin-eu/itwinai'
72 | url: 'https://itwinai.readthedocs.io/'
73 | abstract: AI on cloud and HPC made simple for science
74 | keywords:
75 |   - Artificial intelligence
76 |   - Machine learning
77 |   - Digital twins
78 |   - Climate research
79 |   - Physics research
80 | license: MIT
81 | --------------------------------------------------------------------------------