├── .python-version
├── use-cases
│   ├── virgo
│   │   ├── .gitignore
│   │   ├── requirements.txt
│   │   ├── synthetic-data-gen
│   │   │   └── data_generation_hdf5.sh
│   │   └── slurm_config.yaml
│   ├── cyclones
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── src
│   │   │   ├── strategy.py
│   │   │   └── transform.py
│   │   ├── README.md
│   │   ├── pipeline.yaml
│   │   └── startscript.sh
│   ├── eurac
│   │   ├── .gitignore
│   │   ├── requirements.txt
│   │   ├── slurm_config.yaml
│   │   └── data.py
│   ├── xtclim
│   │   ├── src
│   │   │   ├── .DS_Store
│   │   │   ├── initialization.py
│   │   │   └── utils.py
│   │   ├── outputs
│   │   │   └── .DS_Store
│   │   ├── preprocessing
│   │   │   └── .DS_Store
│   │   ├── requirements.txt
│   │   └── train.py
│   ├── 3dgan
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── create_inference_sample.py
│   │   ├── downsample_h5py_file.py
│   │   ├── run-provenance-experiments.sh
│   │   ├── slurm.jsc.sh
│   │   └── slurm.vega.sh
│   ├── radio-astronomy
│   │   ├── .gitignore
│   │   ├── clean
│   │   ├── requirements.txt
│   │   └── .pytest-clean
│   ├── mnist
│   │   ├── tensorflow
│   │   │   ├── README.md
│   │   │   ├── startscript.sh
│   │   │   ├── pipeline.yaml
│   │   │   └── dataloader.py
│   │   ├── torch
│   │   │   ├── slurm_config.yaml
│   │   │   ├── Dockerfile
│   │   │   ├── create_inference_sample.py
│   │   │   └── saver.py
│   │   └── torch-lightning
│   │       ├── README.md
│   │       └── startscript
│   ├── README.md
│   └── lattice-qcd
│       ├── train.py
│       ├── setup.py
│       └── config.yaml
├── docs
│   ├── requirements.txt
│   ├── tutorials
│   │   ├── workflows
│   │   │   ├── 03-dag-workflows
│   │   │   ├── 01-pipeline-introduction
│   │   │   ├── 02-pipeline-configuration
│   │   │   └── 04_itwinai_argparser.rst
│   │   ├── distrib-ml
│   │   │   ├── torch-tutorial-GAN.rst
│   │   │   ├── torch_tutorial_0_basics.rst
│   │   │   ├── tf_tutorial_1_imagenet.rst
│   │   │   ├── torch_tutorial_1_mnist.rst
│   │   │   ├── tf_tutorial_0_basics.rst
│   │   │   ├── torch_tutorial_kubeflow_1.rst
│   │   │   ├── tf_scaling_test.rst
│   │   │   ├── kuberay-setup-tutorial.rst
│   │   │   ├── torch_scaling_test.rst
│   │   │   └── torch-tutorial-containers.rst
│   │   └── tutorials.rst
│   ├── getting-started
│   │   ├── plugins.rst
│   │   └── plugins-list.rst
│   ├── images
│   │   ├── icon-itwinai-orange.png
│   │   ├── icon-itwinai-white.png
│   │   ├── icon-itwinai-orange-white.png
│   │   ├── icon-itwinai-orange-black-subtitle.png
│   │   ├── icon-itwinai-orange-white-subtitle.png
│   │   ├── icon-itwinai-orange-black-subtitle-small.png
│   │   └── scalability-plots
│   │       ├── mnist
│   │       │   ├── absolute_epoch_time.png
│   │       │   ├── computation_vs_other_plot.png
│   │       │   └── relative_epoch_time_speedup.png
│   │       ├── virgo
│   │       │   ├── absolute_epoch_time.png
│   │       │   ├── computation_vs_other_plot.png
│   │       │   ├── outdated
│   │       │   │   ├── gpu_energy_plot.png
│   │       │   │   ├── utilization_plot.png
│   │       │   │   ├── communication_plot.png
│   │       │   │   ├── absolute_scalability_plot.png
│   │       │   │   └── relative_scalability_plot.png
│   │       │   └── relative_epoch_time_speedup.png
│   │       └── eurac
│   │           └── outdated
│   │               ├── gpu_energy_plot.png
│   │               ├── utilization_plot.png
│   │               ├── communication_plot.png
│   │               ├── absolute_scalability_plot.png
│   │               └── relative_scalability_plot.png
│   ├── how-it-works
│   │   ├── loggers
│   │   │   └── figures
│   │   │       └── logger_fig.png
│   │   ├── workflows
│   │   │   └── figures
│   │   │       ├── comp_Get.png
│   │   │       ├── comp_Adapt.png
│   │   │       ├── comp_Proc.png
│   │   │       ├── comp_Save.png
│   │   │       ├── comp_Split.png
│   │   │       ├── comp_Train.png
│   │   │       ├── Adapt_example.png
│   │   │       ├── comp_Predict.png
│   │   │       ├── simple_pipeline.png
│   │   │       └── Advanced_workflow.png
│   │   └── training
│   │       ├── training.rst
│   │       └── explain_ddp.rst
│   ├── api
│   │   ├── cli.md
│   │   ├── itwinai.loggers.rst
│   │   ├── itwinai.type.rst
│   │   ├── itwinai.utils.rst
│   │   ├── itwinai.parser.rst
│   │   ├── itwinai.distributed.rst
│   │   ├── itwinai.pipeline.rst
│   │   ├── itwinai.components.rst
│   │   ├── itwinai.serialization.rst
│   │   ├── cli_reference.rst
│   │   ├── modules.rst
│   │   ├── itwinai.tf.modules.rst
│   │   ├── itwinai.tests.modules.rst
│   │   ├── itwinai.scalability_report.modules.rst
│   │   └── itwinai.torch.modules.rst
│   ├── _static
│   │   └── custom.css
│   ├── use-cases
│   │   ├── xtclim_doc.rst
│   │   ├── use_cases.rst
│   │   ├── mnist_doc.rst
│   │   ├── latticeqcd_doc.rst
│   │   ├── cyclones_doc.rst
│   │   └── 3dgan_doc.rst
│   ├── Makefile
│   ├── make.bat
│   ├── testing-with-pytest.md
│   └── installation
│       ├── user_installation.rst
│       ├── software_prerequisites.rst
│       └── post_itwinai_installation.rst
├── tutorials
│   ├── ml-workflows
│   │   ├── .gitignore
│   │   ├── 03-dag-workflows
│   │   │   └── Advanced_workflow.png
│   │   ├── 01-pipeline-introduction
│   │   │   ├── sample_pipeline_1.jpg
│   │   │   └── sample_pipeline_2.jpg
│   │   └── 04-itwinai-argparser
│   │       └── README.md
│   ├── distributed-ml
│   │   ├── torch-tutorial-1-mnist
│   │   │   ├── .gitignore
│   │   │   ├── slurm_config.yaml
│   │   │   ├── config.yaml
│   │   │   └── README.md
│   │   ├── torch-tutorial-2-trainer-class
│   │   │   ├── slurm_config.yaml
│   │   │   ├── README.md
│   │   │   ├── sample_srun.sh
│   │   │   └── sample_code.py
│   │   ├── torch-scaling-test
│   │   │   ├── img
│   │   │   │   └── report.png
│   │   │   ├── config
│   │   │   │   ├── ddp.yaml
│   │   │   │   ├── deepspeed.yaml
│   │   │   │   ├── horovod.yaml
│   │   │   │   └── base.yaml
│   │   │   └── slurm_config.yaml
│   │   ├── torch-tutorial-0-basics
│   │   │   └── slurm_config.yaml
│   │   ├── torch-tutorial-GAN
│   │   │   └── slurm_config.yaml
│   │   ├── tf-tutorial-0-basics
│   │   │   ├── README.md
│   │   │   └── tfmirrored_slurm.sh
│   │   ├── tf-tutorial-1-imagenet
│   │   │   ├── README.md
│   │   │   └── tfmirrored_slurm.sh
│   │   ├── torch-tutorial-containers
│   │   │   ├── model.py
│   │   │   ├── run_docker.sh
│   │   │   ├── runall.sh
│   │   │   └── config.yaml
│   │   ├── torch-kubeflow-1
│   │   │   ├── Dockerfile
│   │   │   └── cpu.yaml
│   │   └── tf-scaling-test-jube
│   │       ├── README.md
│   │       └── jube_ddp.sh
│   └── hpo-workflows
│       └── fashion-mnist
│           └── config.yaml
├── .github
│   ├── linters
│   │   ├── .shellcheckrc
│   │   ├── .isort.cfg
│   │   ├── .jscpd.json
│   │   ├── .flake8
│   │   ├── .markdownlint.json
│   │   ├── mlc_config.json
│   │   ├── .hadolint.yaml
│   │   └── .ruff.toml
│   ├── dependabot.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── workflows
│   │   ├── check-links.yml
│   │   ├── sqaaas.yml
│   │   ├── pypi.yml
│   │   └── pytest.yml
│   └── ISSUE_TEMPLATE.md
├── ci
│   ├── .gitattributes
│   ├── .gitignore
│   ├── dagger.json
│   ├── src
│   │   └── main
│   │       ├── literals.py
│   │       ├── utils.py
│   │       └── __init__.py
│   └── pyproject.toml
├── env-files
│   ├── torch
│   │   ├── requirements
│   │   │   ├── requirements.txt
│   │   │   ├── cmcc-requirements.txt
│   │   │   └── README.md
│   │   ├── jupyter
│   │   │   ├── ipython_kernel_config.json
│   │   │   ├── start-cloud.sh
│   │   │   ├── README.md
│   │   │   ├── setup.sh
│   │   │   └── asyncssh_config.py
│   │   ├── horovod-deepspeed-JSC.slurm
│   │   ├── createEnvVega.sh
│   │   ├── README.md
│   │   ├── install-horovod-deepspeed-cuda.sh
│   │   ├── generic_torch.sh
│   │   └── createEnvJSC.sh
│   ├── docs
│   │   ├── build-docs-jsc.sh
│   │   └── create-docs-env-jsc.sh
│   └── tensorflow
│       ├── createEnvVegaTF.sh
│       ├── generic_tf.sh
│       └── createEnvJSCTF.sh
├── .gitmodules
├── setup.cfg
├── COPYRIGHT
├── THIRD_PARTY_LICENSES
├── src
│   └── itwinai
│       ├── plugins
│       │   └── __init__.py
│       ├── torch
│       │   ├── __init__.py
│       │   ├── data
│       │   │   └── __init__.py
│       │   ├── models
│       │   │   └── __init__.py
│       │   ├── type.py
│       │   └── reproducibility.py
│       ├── tensorflow
│       │   ├── __init__.py
│       │   ├── data
│       │   │   └── __init__.py
│       │   ├── models
│       │   │   ├── __init__.py
│       │   │   └── mnist.py
│       │   └── utils.py
│       ├── tests
│       │   ├── exceptions.py
│       │   └── __init__.py
│       ├── type.py
│       ├── slurm
│       │   ├── sample_slurm_config.yaml
│       │   ├── slurm_constants.py
│       │   └── slurm_script_configuration.py
│       └── constants.py
├── CHANGELOG
├── CODEOWNERS
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── tests
│   ├── test_cli.py
│   ├── torch
│   │   └── test_config.py
│   ├── run_on_jsc.sh
│   ├── conftest.py
│   ├── use-cases
│   │   ├── conftest.py
│   │   └── test_cyclones.py
│   └── components
│       └── conftest.py
├── MAINTAINERS.md
├── .readthedocs.yaml
├── .dockerignore
└── CITATION.cff

/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 |
--------------------------------------------------------------------------------
/use-cases/virgo/.gitignore:
--------------------------------------------------------------------------------
1 | data/
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | .[torch,docs]
2 |
--------------------------------------------------------------------------------
/tutorials/ml-workflows/.gitignore:
--------------------------------------------------------------------------------
1 | *.yaml
--------------------------------------------------------------------------------
/use-cases/cyclones/requirements.txt:
--------------------------------------------------------------------------------
1 | gdown
--------------------------------------------------------------------------------
/.github/linters/.shellcheckrc:
--------------------------------------------------------------------------------
1 | disable=SC2148
2 |
--------------------------------------------------------------------------------
/ci/.gitattributes:
--------------------------------------------------------------------------------
1 | /sdk/** linguist-generated
2 |
--------------------------------------------------------------------------------
/use-cases/eurac/.gitignore:
--------------------------------------------------------------------------------
1 | tmp
2 | plots/
3 |
--------------------------------------------------------------------------------
/use-cases/cyclones/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | experiments
--------------------------------------------------------------------------------
/ci/.gitignore:
--------------------------------------------------------------------------------
1 | /sdk
2 | /.venv
3 | /**/__pycache__
4 | /.env
5 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-1-mnist/.gitignore:
--------------------------------------------------------------------------------
1 | MNIST
2 |
--------------------------------------------------------------------------------
/env-files/torch/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # Additional requirements go here
2 |
--------------------------------------------------------------------------------
/.github/linters/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | profile = black
3 | known_first_party = itwinai
--------------------------------------------------------------------------------
/docs/tutorials/workflows/03-dag-workflows:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/03-dag-workflows
--------------------------------------------------------------------------------
/use-cases/virgo/requirements.txt:
--------------------------------------------------------------------------------
1 | gwpy
2 | h5py
3 | pandas
4 | scikit-learn
5 | matplotlib
6 |
--------------------------------------------------------------------------------
/docs/tutorials/workflows/01-pipeline-introduction:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/01-pipeline-introduction/
--------------------------------------------------------------------------------
/docs/tutorials/workflows/02-pipeline-configuration:
--------------------------------------------------------------------------------
1 | ../../../tutorials/ml-workflows/02-pipeline-configuration/
--------------------------------------------------------------------------------
/use-cases/xtclim/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/src/.DS_Store
--------------------------------------------------------------------------------
/docs/getting-started/plugins.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../tutorials/plugins/README.md
2 |    :parser: myst_parser.sphinx_
3 |
--------------------------------------------------------------------------------
/.github/linters/.jscpd.json:
--------------------------------------------------------------------------------
1 | {
2 |   "threshold": 2.0,
3 |   "ignore": [
4 |     "**/itwinai/loggers.py"
5 |   ]
6 | }
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-white.png
--------------------------------------------------------------------------------
/use-cases/xtclim/outputs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/outputs/.DS_Store
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-white.png
--------------------------------------------------------------------------------
/use-cases/xtclim/preprocessing/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/use-cases/xtclim/preprocessing/.DS_Store
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-2-trainer-class/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | training_cmd: "train.py"
2 | num_nodes: 2
3 | gpus_per_node: 4
4 |
--------------------------------------------------------------------------------
/use-cases/3dgan/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py>=3.7.0
2 | google>=3.0.0
3 | protobuf>=4.24.3
4 | gdown>=4.7.1
5 | # plotly>=5.18.0
6 | # kaleido>=0.2.1
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tutorials/plugins"]
2 |     path = tutorials/plugins
3 |     url = https://github.com/interTwin-eu/itwinai-plugin-template/
--------------------------------------------------------------------------------
/docs/how-it-works/loggers/figures/logger_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/loggers/figures/logger_fig.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Get.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Get.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Adapt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Adapt.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Proc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Proc.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Save.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Split.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Split.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Train.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-black-subtitle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-black-subtitle.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-white-subtitle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-white-subtitle.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/Adapt_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/Adapt_example.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/comp_Predict.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/comp_Predict.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/simple_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/simple_pipeline.png
--------------------------------------------------------------------------------
/docs/images/icon-itwinai-orange-black-subtitle-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/icon-itwinai-orange-black-subtitle-small.png
--------------------------------------------------------------------------------
/docs/how-it-works/workflows/figures/Advanced_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/how-it-works/workflows/figures/Advanced_workflow.png
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/img/report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/distributed-ml/torch-scaling-test/img/report.png
--------------------------------------------------------------------------------
/use-cases/eurac/requirements.txt:
--------------------------------------------------------------------------------
1 | hython[complete] @ git+https://github.com/interTwin-eu/hython.git@main
2 | scikit-learn
3 | tqdm
4 | cf_xarray
5 | requests
6 | aiohttp
7 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/absolute_epoch_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/absolute_epoch_time.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/absolute_epoch_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/absolute_epoch_time.png
--------------------------------------------------------------------------------
/tutorials/ml-workflows/03-dag-workflows/Advanced_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/03-dag-workflows/Advanced_workflow.png
--------------------------------------------------------------------------------
/.github/linters/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8
3 | extend-ignore = E203,W503
4 | max-line-length = 95
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/gpu_energy_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/gpu_energy_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/utilization_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/utilization_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/computation_vs_other_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/computation_vs_other_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/computation_vs_other_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/computation_vs_other_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/gpu_energy_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/gpu_energy_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/utilization_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/utilization_plot.png
--------------------------------------------------------------------------------
/env-files/torch/jupyter/ipython_kernel_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "IPKernelApp": {
3 |     "extensions": [
4 |       "rucio_jupyterlab.kernels.ipython"
5 |     ]
6 |   }
7 | }
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/.gitignore:
--------------------------------------------------------------------------------
1 | syn_payload/
2 | syn_param/
3 | models/*
4 | scalability-metrics/
5 | plots/*
6 | outputs/
7 | mllogs/
8 | checkpoints/
9 | __pycache__/
10 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/communication_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/communication_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/mnist/relative_epoch_time_speedup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/mnist/relative_epoch_time_speedup.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/communication_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/communication_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/relative_epoch_time_speedup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/relative_epoch_time_speedup.png
--------------------------------------------------------------------------------
/env-files/torch/requirements/cmcc-requirements.txt:
--------------------------------------------------------------------------------
1 | cartopy
2 | joblib
3 | lightning
4 | matplotlib
5 | munch
6 | pandas
7 | requests
8 | tqdm
9 | timm
10 | toml
11 | xarray
12 | zarr
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | extend-ignore = E203,W503
3 | max-line-length = 95
4 | exclude = .git,__pycache__,docs/conf.py,use-cases,tutorials,tests,old,build,dist,.venv*,envAI*,env-files,.vscode,ci
--------------------------------------------------------------------------------
/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_1.jpg
--------------------------------------------------------------------------------
/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/tutorials/ml-workflows/01-pipeline-introduction/sample_pipeline_2.jpg
--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
1 | This project is licensed under Apache-2.0.
2 |
3 | Copyrights in this project are retained by their contributors.
4 | No copyright assignment is required to contribute to this project.
--------------------------------------------------------------------------------
/docs/api/cli.md:
--------------------------------------------------------------------------------
1 | # itwinai CLI reference placeholder
2 |
3 | Please overwrite this file before building the docs:
4 |
5 | ```bash
6 | typer itwinai.cli utils docs --output docs/api/cli.md
7 | ```
8 |
--------------------------------------------------------------------------------
/env-files/torch/requirements/README.md:
--------------------------------------------------------------------------------
1 | # Additional requirements
2 |
3 | This folder contains additional (optional) Python dependencies, for instance
4 | dependencies specific to interTwin use cases.
5 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* Custom CSS for resizing the Sphinx logo */
2 | .logo img {
3 |   width: 150px; /* Adjust the width as needed */
4 |   height: auto; /* Maintain the aspect ratio */
5 | }
6 |
--------------------------------------------------------------------------------
/docs/api/itwinai.loggers.rst:
--------------------------------------------------------------------------------
1 | itwinai.loggers
2 | ================
3 |
4 | .. automodule:: itwinai.loggers
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/absolute_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/absolute_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/eurac/outdated/relative_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/eurac/outdated/relative_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/absolute_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/absolute_scalability_plot.png
--------------------------------------------------------------------------------
/docs/images/scalability-plots/virgo/outdated/relative_scalability_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interTwin-eu/itwinai/HEAD/docs/images/scalability-plots/virgo/outdated/relative_scalability_plot.png
--------------------------------------------------------------------------------
/docs/api/itwinai.type.rst:
--------------------------------------------------------------------------------
1 | itwinai.type
2 | =============
3 |
4 | .. automodule:: itwinai.type
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.utils.rst:
--------------------------------------------------------------------------------
1 | itwinai.utils
2 | =============
3 |
4 | .. automodule:: itwinai.utils
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.parser.rst:
--------------------------------------------------------------------------------
1 | itwinai.parser
2 | ==============
3 |
4 | .. automodule:: itwinai.parser
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 2
3 | updates:
4 |   # Maintain dependencies for GitHub Actions
5 |   - package-ecosystem: "github-actions"
6 |     directory: "/"
7 |     schedule:
8 |       interval: "daily"
9 |
--------------------------------------------------------------------------------
/docs/api/itwinai.distributed.rst:
--------------------------------------------------------------------------------
1 | itwinai.distributed
2 | ===================
3 |
4 | .. automodule:: itwinai.distributed
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
--------------------------------------------------------------------------------
/docs/api/itwinai.pipeline.rst:
--------------------------------------------------------------------------------
1 | itwinai.pipeline
2 | ================
3 |
4 | .. automodule:: itwinai.pipeline
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/itwinai.components.rst:
--------------------------------------------------------------------------------
1 | itwinai.components
2 | ==================
3 |
4 | .. automodule:: itwinai.components
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/use-cases/xtclim/requirements.txt:
--------------------------------------------------------------------------------
1 | cartopy
2 | cftime
3 | codecarbon
4 | dask
5 | datetime
6 | imageio
7 | ipykernel
8 | matplotlib
9 | numpy
10 | pandas
11 | tqdm
12 | urllib3==1.26.*
13 | xarray
14 | netCDF4
15 | h5netcdf
--------------------------------------------------------------------------------
/docs/api/itwinai.serialization.rst:
--------------------------------------------------------------------------------
1 | itwinai.serialization
2 | =====================
3 |
4 | .. automodule:: itwinai.serialization
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |    :member-order: bysource
9 |
10 |
--------------------------------------------------------------------------------
/.github/linters/.markdownlint.json:
--------------------------------------------------------------------------------
1 | {
2 |   "MD013": {
3 |     "line_length": 120,
4 |     "code_blocks": false,
5 |     "tables": false
6 |   },
7 |   "MD014": false,
8 |   "MD024": false,
9 |   "MD026": {
10 |     "punctuation": ".,:;!"
11 |   }
12 | }
13 |
--------------------------------------------------------------------------------
/env-files/torch/jupyter/start-cloud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo "[start.sh] Running setup.sh for Rucio (generates rucio.cfg)..."
5 | /usr/local/bin/setup.sh
6 |
7 | echo "[start.sh] Running original start.sh..."
8 | exec /usr/local/bin/start-original.sh "$@"
9 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/clean:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | rm -rf outputs checkpoints plots/*
4 | rm models/trained_Filter_test_v0.pt
5 | rm models/trained_CNN1D_test_v0.pt
6 | rm models/trained_UNet_test_v0.pt
7 | rm -rf logs
8 | rm -rf mllogs ml-logs mlruns
9 | rm -f progress.out report.out
--------------------------------------------------------------------------------
/use-cases/mnist/tensorflow/README.md:
--------------------------------------------------------------------------------
1 | # TensorFlow example on MNIST dataset
2 |
3 | **Integration author(s)**: Roman Machacek (CERN), Matteo Bunino (CERN)
4 |
5 | ## Training
6 |
7 | ```bash
8 | # Run the whole training pipeline
9 | itwinai exec-pipeline +pipe_key=pipeline
10 | ```
11 |
--------------------------------------------------------------------------------
/THIRD_PARTY_LICENSES:
--------------------------------------------------------------------------------
1 | The file `src/itwinai/flamegraph.pl` is from Brendan Gregg’s Flamegraph project
2 | (https://github.com/brendangregg/Flamegraph) and is licensed under the CDDL v1.0. It was
3 | copied unmodified on 2025-04-22.
4 |
5 | See `licenses/CDDL-1.0.txt` for the full license text.
6 |
--------------------------------------------------------------------------------
/docs/use-cases/xtclim_doc.rst:
--------------------------------------------------------------------------------
1 | ML-based extreme events detection and characterization (xtclim, CERFACS)
2 | ========================================================================
3 |
4 | .. include:: ../../use-cases/xtclim/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 3
7 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/requirements.txt:
--------------------------------------------------------------------------------
1 | numpyencoder>=0.3.0
2 | pulsarrfi-nn @ git+https://gitlab.com/ml-ppa/pulsarrfi_nn.git@version_0.2#subdirectory=unet_semantic_segmentation
3 | pulsardt @ git+https://gitlab.com/ml-ppa/pulsardt@main
4 | ipywidgets
5 | pyqt6>=6.0
6 | pyquaternion>=0.9.9
7 | scikit-image>=0.22.0
8 | tqdm>=4.65.0
--------------------------------------------------------------------------------
/ci/dagger.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "itwinai",
3 |   "engineVersion": "v0.18.12",
4 |   "sdk": {
5 |     "source": "python"
6 |   },
7 |   "dependencies": [
8 |     {
9 |       "name": "k3s",
10 |       "source": "github.com/marcosnils/daggerverse/k3s@k3s/v0.1.10",
11 |       "pin": "28eea1fcf3b6ecb38a628186107760acd717442f"
12 |     }
13 |   ]
14 | }
15 |
--------------------------------------------------------------------------------
/src/itwinai/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to
7 | [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
8 |
9 | ## [Unreleased]
10 |
11 | ## [X.X.XX]
12 | - Change description (#PR_NUMBER) (AUTHOR)
13 |
--------------------------------------------------------------------------------
/docs/api/cli_reference.rst:
--------------------------------------------------------------------------------
1 | CLI
2 | ===
3 |
4 | Here you can find the itwinai CLI reference.
5 |
6 | .. cli_reference.md must be generated by typer using:
7 | .. $ typer itwinai.cli utils docs --output docs/api/cli.md
8 | .. More info: https://typer.tiangolo.com/tutorial/package/#generate-docs
9 |
10 | .. include:: cli.md
11 |    :parser: myst_parser.sphinx_
12 |    :start-line: 2
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/data/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/torch/models/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/data/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/src/itwinai/tensorflow/models/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/use-cases/radio-astronomy/.pytest-clean:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ### THIS IS A CLEAN-UP SCRIPT FOR THE TEST SUITE ###
4 | ### PLEASE DO NOT EDIT THIS FILE UNLESS WORKING ###
5 | ### ON THE TEST SUITE FOR THE RADIO-ASTRONOMY USE-CASE ###
6 |
7 | rm -rf outputs checkpoints plots/*
8 | rm -rf logs
9 | rm -rf mllogs ml-logs mlruns
10 | rm -f progress.out report.out
11 | rm -rf .test_dataset
--------------------------------------------------------------------------------
/docs/api/modules.rst:
--------------------------------------------------------------------------------
1 | Python SDK
2 | ==========
3 |
4 | .. toctree::
5 |    :maxdepth: 4
6 |
7 |    itwinai.components
8 |    itwinai.distributed
9 |    itwinai.loggers
10 |    itwinai.parser
11 |    itwinai.pipeline
12 |    itwinai.scalability_report.modules
13 |    itwinai.serialization
14 |    itwinai.tests.modules
15 |    itwinai.tf.modules
16 |    itwinai.torch.modules
17 |    itwinai.type
18 |    itwinai.utils
19 |
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/torch-tutorial-GAN.rst:
--------------------------------------------------------------------------------
1 | GAN tutorial with PyTorch
2 | =========================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-GAN/README.md
5 |    :parser: myst_parser.sphinx_
6 |
7 |
8 | Python files
9 | ------------------
10 |
11 | train.py
12 | ++++++++++++
13 |
14 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-GAN/train.py
15 |    :language: python
16 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/ddp.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | backend: nccl
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/deepspeed.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | backend: nccl
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-0-basics/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | time: 00:20:00
3 | partition: develbooster
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py -s {dist_strat}"
17 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-GAN/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | partition: develbooster
3 | time: 00:20:00
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py --strategy {dist_strat}"
17 |
--------------------------------------------------------------------------------
/use-cases/mnist/torch/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | num_nodes: 1
2 | gpus_per_node: 4
3 | python_venv: ../../../.venv/
4 | account: s24r05-03-users
5 | partition: gpu
6 |
7 | dist_strat: ddp
8 | pipe_key: training_pipeline
9 |
10 | py_spy: false
11 | profiling_sampling_rate: 100
12 |
13 | training_cmd: "$(which itwinai) exec-pipeline \
14 |   strategy={dist_strat} \
15 |   checkpoints_location=checkpoints_{dist_strat} \
16 |   +pipe_key={pipe_key}"
17 |
--------------------------------------------------------------------------------
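The `training_cmd` in the slurm_config.yaml files above uses Python-style `{placeholder}` fields (e.g. `{dist_strat}`, `{pipe_key}`) that are filled in from the other config keys before the SLURM script is generated. A minimal sketch of that expansion, assuming simple `str.format`-style substitution (the actual interpolation happens inside itwinai's SLURM script builder):

```python
# Illustrative only: shows the {placeholder} convention used by training_cmd.
# The real substitution is performed by itwinai's SLURM script builder.
config = {
    "dist_strat": "ddp",
    "pipe_key": "training_pipeline",
}
training_cmd = (
    "$(which itwinai) exec-pipeline "
    "strategy={dist_strat} "
    "checkpoints_location=checkpoints_{dist_strat} "
    "+pipe_key={pipe_key}"
)
# str.format replaces each {key} with the matching config value
print(training_cmd.format(**config))
# -> $(which itwinai) exec-pipeline strategy=ddp
#    checkpoints_location=checkpoints_ddp +pipe_key=training_pipeline
```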
/docs/tutorials/distrib-ml/torch_tutorial_0_basics.rst:
--------------------------------------------------------------------------------
1 | Introduction to distributed ML with PyTorch
2 | ===============================================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-0-basics/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 2
7 |
8 | train.py
9 | ++++++++
10 |
11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-0-basics/train.py
12 |    :language: python
13 |
--------------------------------------------------------------------------------
/docs/tutorials/workflows/04_itwinai_argparser.rst:
--------------------------------------------------------------------------------
1 | Integrating configuration with command line arguments
2 | =========================================================
3 |
4 |
5 | .. include:: ../../../tutorials/ml-workflows/04-itwinai-argparser/README.md
6 |    :parser: myst_parser.sphinx_
7 |
8 |
9 | main.py
10 | ---------
11 |
12 | .. literalinclude:: ../../../tutorials/ml-workflows/04-itwinai-argparser/main.py
13 |    :language: python
14 |
15 |
16 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm_config.yaml:
--------------------------------------------------------------------------------
1 | account: intertwin
2 | partition: develbooster
3 | time: 00:20:00
4 |
5 | dist_strat: ddp
6 | std_out: slurm_job_logs/${dist_strat}.out
7 | err_out: slurm_job_logs/${dist_strat}.err
8 | job_name: ${dist_strat}-job
9 |
10 | python_venv: ../../../.venv/
11 |
12 | num_nodes: 1
13 | gpus_per_node: 4
14 | cpus_per_task: 16
15 |
16 | training_cmd: "train.py -s {dist_strat} -c config.yaml"
17 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | <!--
8 | -->
9 |
10 | # Summary
11 |
12 |
13 | ---
14 |
15 |
16 |
17 | **Related issue :**
18 |
--------------------------------------------------------------------------------
/tutorials/distributed-ml/torch-scaling-test/config/horovod.yaml:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | fp16_allreduce: False
11 | use_adasum: False
12 | gradient_predivide_factor: 1.0
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners
2 | # https://github.blog/2017-07-06-introducing-code-owners/
3 |
4 | # Assign code owners that will automatically get asked to review Pull Requests
5 | # The last matching pattern takes the most precedence.
6 |
7 | # These owners will be the default owners for everything in the repo.
8 | # Unless a later match takes precedence, they will be requested for
9 | # review when someone opens a pull request.
10 |
11 | * @matbun
--------------------------------------------------------------------------------
/use-cases/xtclim/src/initialization.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | # Mean-Squared Error as the average difference between the pixels
5 | # in the original image vs. the reconstructed one
6 | criterion = nn.MSELoss()
7 | # pixel-wise MSE loss
8 | pixel_wise_criterion = nn.MSELoss(reduction='none')
9 |
10 | # KL divergence handles dispersion of information in latent space
11 | # a balance is to be found with the prevailing reconstruction error
12 | beta = 0.1
13 |
14 | # number of evaluations for each dataset
15 | n_avg = 20
--------------------------------------------------------------------------------
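The constants in `initialization.py` above (the MSE `criterion` and the KL weight `beta`) describe a VAE-style objective: reconstruction error balanced against a beta-weighted KL divergence. A minimal sketch of how such a loss is typically assembled, assuming a standard diagonal-Gaussian latent; the `vae_loss` helper below is illustrative and not code from this repository:

```python
import torch
import torch.nn as nn

criterion = nn.MSELoss()
beta = 0.1  # weight of the KL term, as in initialization.py


def vae_loss(recon_x, x, mu, log_var):
    """Reconstruction (MSE) plus beta-weighted KL divergence.

    Assumes a diagonal-Gaussian latent parameterized by (mu, log_var),
    using the usual closed-form KL against N(0, I).
    """
    recon = criterion(recon_x, x)
    # KL(N(mu, sigma^2) || N(0, 1)), summed over latent dims, averaged over batch
    kl = -0.5 * torch.mean(
        torch.sum(1 + log_var - mu.pow(2) - log_var.exp(), dim=1)
    )
    return recon + beta * kl
```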
/ci/src/main/literals.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | import dagger
11 | from dagger import enum_type
12 |
13 |
14 | @enum_type
15 | class MLFramework(dagger.Enum):
16 |     TORCH = "TORCH"
17 |     TENSORFLOW = "TENSORFLOW"
--------------------------------------------------------------------------------
/src/itwinai/tests/exceptions.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino <matteo.bunino@cern.ch> - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """Custom exceptions raised during sanity checks for itwinai."""
11 |
12 |
13 | class SanityCheckError(Exception):
14 |     """Base exception for all sanity check errors."""
--------------------------------------------------------------------------------
/tutorials/ml-workflows/04-itwinai-argparser/README.md:
--------------------------------------------------------------------------------
1 | # itwinai ArgumentParser
2 |
3 | **Author(s)**: Matteo Bunino
4 |
5 | itwinai provides a wrapper of jsonargparse's ArgumentParser which supports
6 | configuration files by default.
7 |
8 | To run as usual:
9 |
10 | ```bash
11 | python main.py -d 20 --train-prop 0.7 --val-prop 0.2 --lr 1e-5
12 | ```
13 |
14 | To reuse the parameters saved in a configuration file and override some
15 | parameter (e.g., learning rate):
16 |
17 | ```bash
18 | python main.py --config advanced_tutorial_conf.yaml --lr 2e-3
19 | ```
20 |
--------------------------------------------------------------------------------
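For context, the underlying `jsonargparse` library already supports config files via `ActionConfigFile`; itwinai's wrapper enables this by default. A minimal sketch of the plain-`jsonargparse` equivalent, with argument names mirroring the tutorial's CLI flags (illustrative, not the tutorial's actual `main.py`):

```python
from jsonargparse import ActionConfigFile, ArgumentParser

parser = ArgumentParser()
# --config lets users load arguments from a YAML file
parser.add_argument("--config", action=ActionConfigFile)
parser.add_argument("-d", "--data-size", type=int, default=20)
parser.add_argument("--train-prop", type=float, default=0.7)
parser.add_argument("--val-prop", type=float, default=0.2)
parser.add_argument("--lr", type=float, default=1e-5)

# CLI flags given after --config override values loaded from the file
args = parser.parse_args()
print(args.lr)
```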
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "ms-python.flake8",
4 |     "streetsidesoftware.code-spell-checker",
5 |     "njpwerner.autodocstring",
6 |     "dlyz.md-link-checker",
7 |     "davidanson.vscode-markdownlint",
8 |     "ms-python.vscode-pylance",
9 |     "ms-python.python",
10 |     "bierner.markdown-mermaid",
11 |     "tamasfe.even-better-toml",
12 |     "charliermarsh.ruff",
13 |     "github.vscode-github-actions",
14 |     "dnut.rewrap-revived",
15 |     "emilast.logfilehighlighter"
16 |   ]
17 | }
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/tf_tutorial_1_imagenet.rst:
--------------------------------------------------------------------------------
1 | TensorFlow ImageNet example
2 | ===========================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/README.md
5 |    :parser: myst_parser.sphinx_
6 |
7 | train.py
8 | ++++++++
9 |
10 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/train.py
11 |    :language: python
12 |
13 |
14 | tfmirrored_slurm.sh
15 | +++++++++++++++++++
16 |
17 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-1-imagenet/tfmirrored_slurm.sh
18 |    :language: bash
19 |
20 |
--------------------------------------------------------------------------------
/docs/tutorials/distrib-ml/torch_tutorial_1_mnist.rst:
--------------------------------------------------------------------------------
1 | Distributed training on MNIST dataset
2 | ==========================================
3 |
4 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/README.md
5 |    :parser: myst_parser.sphinx_
6 |    :start-line: 2
7 |
8 | train.py
9 | ++++++++
10 |
11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
12 |    :language: python
13 |
14 | config.yaml
15 | +++++++++++
16 |
17 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml
18 |    :language: yaml
19 |
--------------------------------------------------------------------------------
/use-cases/mnist/torch-lightning/README.md:
--------------------------------------------------------------------------------
1 | # Torch Lightning example on MNIST dataset
2 |
3 | **Integration author(s)**: Matteo Bunino (CERN)
4 |
5 | ## Training
6 |
7 | ```bash
8 | # Download dataset and exit: only run first step in the pipeline (index=0)
9 | itwinai exec-pipeline +pipe_key=training_pipeline +pipe_steps=[0]
10 |
11 | # Run the whole training pipeline
12 | itwinai exec-pipeline +pipe_key=training_pipeline
13 | ```
14 |
15 | View training logs on MLFlow server (if activated from the configuration):
16 |
17 | ```bash
18 | mlflow ui --backend-store-uri mllogs/mlflow/
19 | ```
20 |
--------------------------------------------------------------------------------
/use-cases/cyclones/src/strategy.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | # gets the mirrored strategy based on whether or not we are running the model
5 | # with CPU or GPU
6 | def get_mirrored_strategy(cores=4):
7 |     if cores:
8 |         CPUs = ['CPU:'+str(i) for i in range(cores)]
9 |         mirrored_strategy = tf.distribute.MirroredStrategy(CPUs)
10 |     else:
11 |         mirrored_strategy = tf.distribute.MirroredStrategy()
12 |
13 |     print('Number of devices: {}'.format(
14 |         mirrored_strategy.num_replicas_in_sync))
15 |
16 |     return mirrored_strategy, mirrored_strategy.num_replicas_in_sync
--------------------------------------------------------------------------------
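A model built inside the returned strategy's scope has its variables mirrored across the selected devices. A minimal usage sketch of `get_mirrored_strategy`; the Keras model below is an illustrative placeholder, not the cyclones use case's actual network:

```python
import tensorflow as tf

from strategy import get_mirrored_strategy  # assumes strategy.py is importable

# Variables created inside strategy.scope() are replicated on every device
strategy, n_replicas = get_mirrored_strategy(cores=4)
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1),
    ])
    model.compile(optimizer="adam", loss="mse")

# Common practice: scale the global batch size with the number of replicas
global_batch_size = 32 * n_replicas
```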
include:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | train.py 8 | ++++++++ 9 | 10 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/train.py 11 | :language: python 12 | 13 | 14 | tfmirrored_slurm.sh 15 | +++++++++++++++++++ 16 | 17 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-tutorial-0-basics/tfmirrored_slurm.sh 18 | :language: bash 19 | 20 | -------------------------------------------------------------------------------- /env-files/torch/horovod-deepspeed-JSC.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Installation for JSC 4 | 5 | # Job configuration 6 | #SBATCH --job-name=setup_venv 7 | #SBATCH --account=intertwin 8 | #SBATCH --output=horovod_ds_installation.out 9 | #SBATCH --error=horovod_ds_installation.err 10 | #SBATCH --time=00:30:00 11 | 12 | # Resource allocation 13 | #SBATCH --partition=develbooster 14 | #SBATCH --nodes=1 15 | #SBATCH --gres=gpu 16 | 17 | ml --force purge 18 | ml Stages/2025 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 19 | ml Python CMake HDF5 PnetCDF libaio mpi4py git 20 | 21 | source .venv/bin/activate 22 | source env-files/torch/install-horovod-deepspeed-cuda.sh 23 | -------------------------------------------------------------------------------- /docs/api/itwinai.tf.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.tensorflow 2 | ================== 3 | 4 | distributed.py 5 | ++++++++++++++ 6 | 7 | .. automodule:: itwinai.tensorflow.distributed 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | trainer.py 15 | +++++++++++ 16 | 17 | .. automodule:: itwinai.tensorflow.trainer 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | :member-order: bysource 22 | 23 | 24 | utils.py 25 | ++++++++ 26 | 27 | .. automodule:: itwinai.tensorflow.utils 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | :member-order: bysource 32 | 33 | -------------------------------------------------------------------------------- /docs/api/itwinai.tests.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.tests 2 | ============= 3 | 4 | 5 | dummy_components.py 6 | +++++++++++++++++++ 7 | 8 | .. automodule:: itwinai.tests.dummy_components 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :member-order: bysource 13 | 14 | 15 | exceptions.py 16 | +++++++++++++ 17 | 18 | .. automodule:: itwinai.tests.exceptions 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | :member-order: bysource 23 | 24 | 25 | sanity_check.py 26 | +++++++++++++++ 27 | 28 | .. automodule:: itwinai.tests.sanity_check 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | :member-order: bysource 33 | 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/Dockerfile: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Find more base image candidates under: 11 | # - https://github.com/interTwin-eu/itwinai/pkgs/container/itwinai 12 | # - https://github.com/interTwin-eu/itwinai/pkgs/container/itwinai-dev 13 | FROM ghcr.io/intertwin-eu/itwinai:torch-skinny-latest 14 | 15 | # Add torch MNIST use case 16 | COPY use-cases/mnist/torch/* ./ 17 | -------------------------------------------------------------------------------- /src/itwinai/type.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Framework-independent types.""" 11 | 12 | 13 | class MLArtifact: 14 | """A framework-independent machine learning artifact.""" 15 | 16 | 17 | class MLDataset(MLArtifact): 18 | """A framework-independent machine learning dataset.""" 19 | 20 | 21 | class MLModel(MLArtifact): 22 | """A framework-independent machine learning model.""" 23 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-0-basics/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for TensorFlow 2 | 3 | In this tutorial we show how to use TensorFlow `MultiWorkerMirroredStrategy`. 4 | Note that the environment is tested on the HDFML system at JSC. 5 | For other systems, the module versions might need to be changed accordingly. 6 | Other strategies will be added here. 7 | 8 | First, from the root of this repository, build the environment containing 9 | TensorFlow. You can *try* with: 10 | 11 | ```bash 12 | # Creates a Python venv called envAItf_hdfml 13 | make tf-gpu-jsc 14 | ``` 15 | 16 | If you want to distribute the code in `train.py`, run from terminal: 17 | 18 | ```bash 19 | sbatch tfmirrored_slurm.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-1-imagenet/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for TensorFlow 2 | 3 | In this tutorial we show how to use TensorFlow `MultiWorkerMirroredStrategy`. 4 | Note that the environment is tested on the HDFML system at JSC. 5 | For other systems, the module versions might need to be changed accordingly. 6 | Other strategies will be added here. 7 | 8 | First, from the root of this repository, build the environment containing 9 | TensorFlow.
You can *try* with: 10 | 11 | ```bash 12 | # Creates a Python venv called envAItf_hdfml 13 | make tf-gpu-jsc 14 | ``` 15 | 16 | If you want to distribute the code in `train.py`, run from terminal: 17 | 18 | ```bash 19 | sbatch tfmirrored_slurm.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -W -v 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch_tutorial_kubeflow_1.rst: -------------------------------------------------------------------------------- 1 | Tutorial on Kubeflow and TorchTrainer class 2 | =========================================== 3 | 4 | .. include:: ../../../tutorials/distributed-ml/torch-kubeflow-1/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | train-cpu.py 9 | ++++++++++++ 10 | 11 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py 12 | :language: python 13 | 14 | 15 | cpu.yaml 16 | ++++++++ 17 | 18 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml 19 | :language: yaml 20 | 21 | Dockerfile 22 | ++++++++++ 23 | 24 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/Dockerfile 25 | :language: dockerfile 26 | -------------------------------------------------------------------------------- /.github/linters/.hadolint.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | failure-threshold: warning 11 | ignored: 12 | - DL3008 # Pin versions in apt get install. 13 | - DL3013 # Pin versions in pip. TODO: remove. 
14 | - DL4001 # Either use Wget or Curl but not both 15 | - DL3003 # Use WORKDIR to switch to a directory 16 | - DL3006 # Always tag the version of an image explicitly: https://github.com/hadolint/hadolint/issues/339 -------------------------------------------------------------------------------- /env-files/docs/build-docs-jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # - Anna Lappe - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | # Build the documentation locally and serve it on localhost on JSC systems 14 | 15 | ml --force purge 16 | ml Stages/2023 GCCcore/.11.3.0 Python/3.10.4 Pandoc/2.19.2 17 | 18 | source .venv-docs/bin/activate 19 | cd docs 20 | make clean && make html && python -m http.server -d _build/html -------------------------------------------------------------------------------- /docs/use-cases/use_cases.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Each use case comes with its own tutorial on how to run it. Before running them, 5 | however, you should set up a Python virtual environment. 6 | 7 | After installing and activating the virtual environment, you will want to install the 8 | use-case specific dependencies, if applicable. This can be done by first ``cd``-ing 9 | into the use-case directory and then installing the requirements, as follows: 10 | 11 | .. code-block:: bash 12 | 13 | cd use-cases/ 14 | pip install -r requirements.txt 15 | 16 | 17 | Alternatively, you can use the use-case Docker image, if available. After setting 18 | everything up, you can now run the use case as specified in the use case's tutorial. 19 | -------------------------------------------------------------------------------- /env-files/docs/create-docs-env-jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Create .venv-docs virtualenv to build the documentation locally on JSC systems 13 | 14 | ml --force purge 15 | ml Stages/2023 GCCcore/.11.3.0 Python/3.10.4 Pandoc/2.19.2 16 | 17 | cmake --version 18 | gcc --version 19 | 20 | rm -rf .venv-docs 21 | python -m venv .venv-docs 22 | source .venv-docs/bin/activate 23 | 24 | pip install -r docs/requirements.txt -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/tf_scaling_test.rst: -------------------------------------------------------------------------------- 1 | TensorFlow scaling test 2 | ======================= 3 | 4 | .. include:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | train.py 9 | ++++++++ 10 | 11 | .. literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/train.py 12 | :language: python 13 | 14 | 15 | jube_ddp.sh 16 | +++++++++++ 17 | 18 | ..
literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/jube_ddp.sh 19 | :language: bash 20 | 21 | 22 | .. TODO: improve notebook rendering 23 | 24 | .. bench_plot.ipynb 25 | .. ++++++++++++++++ 26 | 27 | .. .. literalinclude:: ../../../tutorials/distributed-ml/tf-scaling-test-jube/bench_plot.ipynb 28 | .. :language: python 29 | -------------------------------------------------------------------------------- /docs/use-cases/mnist_doc.rst: -------------------------------------------------------------------------------- 1 | MNIST dataset 2 | ============= 3 | 4 | This section covers the MNIST use case. This use case has been implemented using three 5 | different frameworks: ``TensorFlow``, ``PyTorch``, and ``PyTorch Lightning``. You can 6 | find the files relevant to this use case 7 | in the `use case's folder on Github `_. 8 | 9 | For more information on each implementation, consult their respective READMEs: 10 | 11 | Torch Lightning 12 | --------------- 13 | 14 | .. include:: ../../use-cases/mnist/torch-lightning/README.md 15 | :parser: myst_parser.sphinx_ 16 | :start-line: 2 17 | 18 | 19 | PyTorch 20 | ------- 21 | 22 | .. include:: ../../use-cases/mnist/torch/README.md 23 | :parser: myst_parser.sphinx_ 24 | :start-line: 2 25 | -------------------------------------------------------------------------------- /docs/use-cases/latticeqcd_doc.rst: -------------------------------------------------------------------------------- 1 | Normalizing flow for generating lattice field configurations (Lattice QCD, ETHZ/CSIC) 2 | ===================================================================================== 3 | 4 | The code is adapted from `this notebook `_ from the Lattice QCD use case. 5 | 6 | More information on the use case is available in the published deliverables, 7 | `D4.2 `_, 8 | `D7.2 `_ and `D7.4 `_. 9 | 10 | 11 | About the use-case and integration 12 | ---------------------------------- 13 | ..
include:: ../../use-cases/lattice-qcd/README.md 14 | :parser: myst_parser.sphinx_ 15 | :start-after: 16 | :end-before: 17 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from itwinai.cli import generate_slurm 4 | from itwinai.slurm.utils import get_slurm_job_parser 5 | 6 | 7 | def test_cli_slurm_function_signature(): 8 | """Test that function signature in cli.py matches argparser""" 9 | args = inspect.getfullargspec(generate_slurm).args 10 | parser = get_slurm_job_parser() 11 | 12 | ignored_args = ["print_config", "help"] 13 | parser_args = {arg.dest for arg in parser._actions} 14 | parser_args -= set(ignored_args) 15 | 16 | missing_in_function = parser_args - set(args) 17 | missing_in_parser = set(args) - parser_args 18 | 19 | assert not missing_in_function and not missing_in_parser, ( 20 | f"Arguments missing in function: {missing_in_function}, " 21 | f"Arguments missing in parser: {missing_in_parser}" 22 | ) 23 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | ## Maintainers 4 | 5 | - Matteo Bunino - CERN - matteo.bunino\cern.ch 6 | - Jarl Sondre Saether - CERN - jarl.sondre.saether\cern.ch 7 | - Linus Eickhoff - CERN - linus.maximilian.eickhoff\cern.ch 8 | - Anna Elisa Lappe - CERN - anna.elisa.lappe\cern.ch 9 | - Rakesh Sarma - FZJ - r.sarma\fz-juelich.de 10 | 11 | ## Contributors 12 | 13 | - Kalliopi Tsolaki - CERN - kalliopi.tsolaki\cern.ch 14 | - Killian Verder - CERN - killian.verder\cern.ch 15 | - Henry Mutegeki - CERN - henry.mutegeki\cern.ch 16 | - Roman Machacek - CERN - roman.machacek\cern.ch 17 | - Alexander Zoechbauer - CERN - alexander.zoechbauer\cern.ch 18 | - Mario Ruettgers - FZJ - m.ruettgers\fz-juelich.de 19 | 20 | [Full contributors list](https://github.com/interTwin-eu/itwinai/graphs/contributors) 21 | -------------------------------------------------------------------------------- /docs/api/itwinai.scalability_report.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.scalability_report 2 | ========================== 3 | 4 | 5 | data.py 6 | +++++++ 7 | .. automodule:: itwinai.scalability_report.data 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | plot.py 15 | +++++++ 16 | .. automodule:: itwinai.scalability_report.plot 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | :member-order: bysource 21 | 22 | 23 | reports.py 24 | ++++++++++ 25 | .. automodule:: itwinai.scalability_report.reports 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | :member-order: bysource 30 | 31 | 32 | utils.py 33 | ++++++++ 34 | .. automodule:: itwinai.scalability_report.utils 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | :member-order: bysource 39 | -------------------------------------------------------------------------------- /docs/use-cases/cyclones_doc.rst: -------------------------------------------------------------------------------- 1 | Tropical Cyclones Detection (CMCC) 2 | ================================== 3 | 4 | The code is adapted from the CMCC use case's 5 | `repository `_ and refers 6 | to a TensorFlow implementation.
7 | To learn more about the interTwin tropical cyclones detection use case and its DT, please 8 | see the published deliverables, `D4.1 `_, 9 | `D7.1 `_ and 10 | `D7.3 `_. 11 | You can find the relevant code in the 12 | `use case's folder on Github `_, 13 | or by consulting the use case's README: 14 | 15 | .. include:: ../../use-cases/cyclones/README.md 16 | :parser: myst_parser.sphinx_ 17 | :start-line: 2 18 | -------------------------------------------------------------------------------- /src/itwinai/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | from .dummy_components import ( 11 | FakeGetter, 12 | FakeGetterExec, 13 | FakePreproc, 14 | FakePreprocExec, 15 | FakeSaver, 16 | FakeSaverExec, 17 | FakeSplitter, 18 | FakeSplitterExec, 19 | FakeTrainer, 20 | FakeTrainerExec, 21 | ) 22 | 23 | _ = ( 24 | FakeGetter, 25 | FakeGetterExec, 26 | FakePreproc, 27 | FakePreprocExec, 28 | FakeSaver, 29 | FakeSaverExec, 30 | FakeSplitter, 31 | FakeSplitterExec, 32 | FakeTrainer, 33 | FakeTrainerExec, 34 | ) 35 | -------------------------------------------------------------------------------- /src/itwinai/torch/type.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Custom types definition.""" 11 | 12 | from typing import Callable 13 | 14 | import torch 15 | 16 | #: Torch data batch sampled by a ``DataLoader``. 17 | Batch = torch.Tensor 18 | 19 | #: Torch metric function provided by ``torchmetrics`` library.
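#: For example, an instance such as ``torchmetrics.Accuracy(task="multiclass", num_classes=10)``
#: fits this alias, as does any plain ``Callable`` returning a score (illustrative
#: examples, not an exhaustive specification).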
20 | Metric = Callable 21 | 22 | 23 | class UninitializedStrategyError(Exception): 24 | """Error raised when a strategy has not been initialized.""" 25 | 26 | 27 | class DistributedStrategyError(Exception): 28 | """Error raised when a strategy has already been initialized.""" 29 | -------------------------------------------------------------------------------- /use-cases/virgo/synthetic-data-gen/data_generation_hdf5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --account=intertwin 4 | #SBATCH --output=array-job/job_%a.out 5 | #SBATCH --error=array-job/job_%a.err 6 | #SBATCH --time=00:07:00 7 | #SBATCH --mem-per-cpu=1G 8 | #SBATCH --partition=develbooster 9 | #SBATCH --array=1-75 10 | #SBATCH --job-name=generate_virgo_data 11 | #SBATCH --cpus-per-task=26 12 | 13 | # Load required modules 14 | ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py 15 | 16 | # Activate Python virtual environment 17 | source ../../envAI_juwels/bin/activate 18 | 19 | # Folder in which the datasets will be stored 20 | target_file="/p/scratch/intertwin/datasets/virgo_hdf5/virgo_data_${SLURM_ARRAY_TASK_ID}.hdf5" 21 | 22 | python synthetic-data-gen/file_gen_hdf5.py \ 23 | --num-datapoints 10000 \ 24 | --num-processes 25 \ 25 | --save-frequency 1000 \ 26 | --save-location "$target_file" 27 | 28 | -------------------------------------------------------------------------------- /use-cases/3dgan/Dockerfile: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | FROM nvcr.io/nvidia/pytorch:23.09-py3 11 | 12 | WORKDIR /usr/src/app 13 | 14 | # Install itwinai 15 | COPY pyproject.toml ./ 16 | COPY src ./ 17 | RUN pip install --upgrade pip \ 18 | && pip install --no-cache-dir lightning \ 19 | && pip install --no-cache-dir . 20 | 21 | # Add 3DGAN use case files and install additional requirements 22 | COPY use-cases/3dgan/requirements.txt ./ 23 | COPY use-cases/3dgan/* ./ 24 | RUN pip install --no-cache-dir -r requirements.txt 25 | 26 | # ENTRYPOINT [ "itwinai", "exec-pipeline" ] 27 | # CMD [ "--config", "pipeline.yaml" ] -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /use-cases/xtclim/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train file to launch pipeline 3 | """ 4 | 5 | import os 6 | import sys 7 | from itwinai.parser import ConfigParser 8 | from itwinai.utils import load_yaml 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), "src")) 11 | sys.path.append(os.path.join(os.path.dirname(__file__), "preprocessing")) 12 | 13 | 14 | if __name__ == "__main__": 15 | 16 | config = load_yaml('pipeline.yaml') 17 | seasons_list = config['seasons'] 18 | 19 | for season in seasons_list: 20 | model_uri = f"outputs/cvae_model_{season}1d_1memb.pth" 21 | override_dict = { 22 | 'season': season, 23 | 'model_uri': model_uri 24 | } 25 | pipe_parser = ConfigParser( 26 | config=config, 27 | override_keys=override_dict 28 | ) 29 | pipeline = pipe_parser.parse_pipeline() 30 | 31 | print(f"Running pipeline for season: {season}") 32 | pipeline.execute() -------------------------------------------------------------------------------- /env-files/tensorflow/createEnvVegaTF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------------------------------------------------- 5 | # Part of the interTwin Project: https://www.intertwin.eu/ 6 | # 7 | # Created by: Matteo Bunino 8 | # 9 | # Credit: 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ ! -f "env-files/tensorflow/generic_tf.sh" ]; then 14 | echo "ERROR: env-files/tensorflow/generic_tf.sh not found!" 15 | exit 1 16 | fi 17 | 18 | # Load modules 19 | # NOTE: REFLECT THEM IN THE MAIN README! 
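# The modules below provide the toolchain used to build the TF venv on Vega:
# compiler and CMake, MPI (OpenMPI + mpi4py), CUDA 11.7 with NCCL and cuDNN,
# and the Python interpreter that generic_tf.sh uses to create the environment.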
20 | ml --force purge 21 | ml Python 22 | ml CMake/3.24.3-GCCcore-11.3.0 23 | ml mpi4py 24 | ml OpenMPI 25 | ml CUDA/11.7 26 | ml GCCcore/11.3.0 27 | ml NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0 28 | ml cuDNN 29 | 30 | 31 | # Create and install torch env 32 | export ENV_NAME=".venv-tf" 33 | bash env-files/tensorflow/generic_tf.sh -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Data and logging 11 | data_dir: ./ 12 | log_int: 10 13 | verbose: True 14 | restart_int: 10 15 | download_only: False 16 | dataset_replication: 10 17 | shuff: False 18 | nworker: 4 # num workers dataloader 19 | prefetch: 2 20 | 21 | # Model 22 | batch_size: 64 23 | epochs: 2 24 | lr: 0.001 25 | momentum: 0.5 26 | 27 | # Reproducibility 28 | rnd_seed: 10 29 | 30 | # Distributed ML 31 | backend: nccl # ignored when using Horovod 32 | 33 | # Horovod: ignored when NOT using Horovod 34 | fp16_allreduce: False 35 | use_adasum: False 36 | gradient_predivide_factor: 1.0 37 | 38 | -------------------------------------------------------------------------------- /use-cases/mnist/torch-lightning/startscript: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=PrototypeTest 5 | #SBATCH --account=intertwin 6 | #SBATCH -o logs_slurm/job-2.out 7 | #SBATCH -e logs_slurm/job-2.err 8 | #SBATCH --time=00:30:00 9 | 10 | # configure node and process count on the CM 11 | #SBATCH --partition=develbooster 12 | #SBATCH --nodes=1 13 | #SBATCH --ntasks-per-node=4 14 | #SBATCH --cpus-per-task=4 15 | #SBATCH --gpus-per-node=4 16 | 17 | #SBATCH --exclusive 18 | 19 | # gres options have to be disabled for deepv 20 | #SBATCH --gres=gpu:4 21 | 22 | # load modules 23 | ml --force purge 24 | ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN 25 | ml Python CMake HDF5 PnetCDF libaio 26 | 27 | # activate environment 28 | source ../../../envAI_juwels/bin/activate 29 | 30 | # ON LOGIN NODE download datasets: 31 | # ../../../.venv-pytorch/bin/itwinai exec-pipeline +pipe_key=training_pipeline +pipe_steps=[dataloading_step] 32 | 33 | srun itwinai exec-pipeline +pipe_steps=[1] -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Check links 3 | 4 | on: 5 | push: 6 | pull_request: 7 | 8 | jobs: 9 | markdown-link-check: 10 | name: Check links using markdown-link-check 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | # Checks out a copy of your repository on the ubuntu-latest machine 15 | - name: Checkout code 16 | uses: actions/checkout@v6 17 | with: 18 | # Make sure the actual branch is checked out when running on PR 19 | # ref: ${{ github.event.pull_request.head.sha }} 20 | # Full git history needed to get proper list of changed files 21 | fetch-depth: 0 22 | 23 | - name: Check links on new changes 24 | uses: gaurav-nelson/github-action-markdown-link-check@v1 25 | with: 26 | config-file: 
".github/linters/mlc_config.json" 27 | check-modified-files-only: "yes" 28 | use-quiet-mode: "yes" 29 | use-verbose-mode: "yes" 30 | base-branch: "main" 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/README.md: -------------------------------------------------------------------------------- 1 | # JupyterLab image for itwinai with Rucio client 2 | 3 | The files in this folder are adapted from the work done by 4 | the [VRE team](https://github.com/vre-hub/environments). 5 | 6 | To build this container, go into the root of itwinai and run 7 | 8 | ```bash 9 | docker build -t : -f env-files/torch/jupyter/Dockerfile . 10 | ``` 11 | 12 | using your preferred `` and ``. 13 | 14 | ## Install custom dependencies 15 | 16 | To install custom dependencies (e.g., use cases packages) you can add them 17 | in a `requirements.txt` file, add it somewhere **in the itwinai directory** and pass 18 | it to the `docker build`: 19 | 20 | ```bash 21 | docker build -t : -f env-files/torch/jupyter/Dockerfile \ 22 | --build-arg REQUIREMENTS=path/to/requirements.txt . 23 | ``` 24 | 25 | For instance: 26 | 27 | ```bash 28 | docker build -t : -f env-files/torch/jupyter/Dockerfile \ 29 | --build-arg REQUIREMENTS=env-files/torch/requirements/cmcc-requirements.txt . 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/kuberay-setup-tutorial.rst: -------------------------------------------------------------------------------- 1 | Distributed Machine Learning on HPC from k8s using KubeRay operator and interLink 2 | ================================================================================= 3 | 4 | .. include:: ../../../tutorials/distributed-ml/kuberay-setup-tutorial/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | raycluster_example.yaml 9 | +++++++++++++++++++++++ 10 | 11 | This file defines the RayCluster, the file is referenced in the tutorial as the values file 12 | used by the KubeRay operator to deploy Ray 13 | clusters on Kubernetes. 14 | It specifies the configuration for head and worker nodes, including resource requests, 15 | environment variables, and startup commands. 
16 | For a full reference of supported fields and structure, see the 17 | `Ray on Kubernetes config documentation `_ 18 | 19 | 20 | .. literalinclude:: ../../../tutorials/distributed-ml/kuberay-setup-tutorial/raycluster_example.yaml 21 | :language: yaml 22 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch_scaling_test.rst: -------------------------------------------------------------------------------- 1 | PyTorch scaling test 2 | ==================== 3 | 4 | .. include:: ../../../tutorials/distributed-ml/torch-scaling-test/README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | 8 | Plots of the scalability metrics 9 | -------------------------------- 10 | 11 | We have the following scalability metrics available: 12 | 13 | - Absolute wall-clock time comparison 14 | - Relative wall-clock time speedup 15 | - Computation vs. Other time 16 | - Communication vs. Computation time (deprecated) 17 | - GPU Utilization (%) 18 | - Power Consumption (Watt) 19 | 20 | You can see example plots of these in the 21 | :doc:`Virgo documentation <../../use-cases/virgo_doc>` or the 22 | :doc:`EURAC documentation <../../use-cases/eurac_doc>`. 23 | 24 | Additionally, we ran a larger scalability test with this tutorial on the full ImageNet 25 | dataset with the older script. This only shows the relative speedup and can be seen here: 26 | 27 | .. image:: ../../../tutorials/distributed-ml/torch-scaling-test/img/report.png 28 | 29 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-scaling-test/config/base.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # Data and logging 11 | data_dir: /p/scratch/intertwin/datasets/imagenet/ILSVRC2012/train/ # tmp_data/ 12 | epoch_time_directory: scalability-metrics/epoch-time 13 | 14 | # Subset size can be an int or None. Cannot be larger than the length of the dataset. 
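# For instance, "subset_size: 500" would train on just 500 samples (an illustrative
# value, not a recommendation).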
15 | # If you wish to set it to "None", you must use "null" as that is what yaml expects 16 | subset_size: 5000 17 | log_int: 10 18 | 19 | # verbose: True 20 | nworker: 4 # num workers dataloader 21 | prefetch: 2 22 | 23 | # Model 24 | batch_size: 64 # micro batch size 25 | epochs: 10 26 | lr: 0.001 27 | momentum: 0.5 28 | shuff: False 29 | 30 | # Reproducibility 31 | rnd_seed: 10 32 | -------------------------------------------------------------------------------- /use-cases/3dgan/create_inference_sample.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Create a simple inference dataset sample and a checkpoint.""" 11 | 12 | import argparse 13 | import os 14 | 15 | import torch 16 | from model import ThreeDGAN 17 | 18 | 19 | def create_checkpoint(root: str = ".", ckpt_name: str = "3dgan-inference.pth"): 20 | ckpt_path = os.path.join(root, ckpt_name) 21 | net = ThreeDGAN() 22 | torch.save(net, ckpt_path) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--root", type=str, default=".") 28 | parser.add_argument("--ckpt-name", type=str, default="3dgan-inference.pth") 29 | args = parser.parse_args() 30 | create_checkpoint(**vars(args)) 31 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-2-trainer-class/sample_srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Jarl Sondre Sæther 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # -------------------------------------------------------------------------------------- 11 | # This file contains the sample bash code that was used in the interTwin presentation 12 | # held on Feb. 18. It is meant to illustrate how to combine srun and torchrun to launch 13 | # processes in parallel that can communicate and thus facilitate distributed ML. 14 | 15 | srun --cpu-bind=none --ntasks-per-node=1 \ 16 | bash -c "torchrun \ 17 | --nnodes=2 \ 18 | --nproc_per_node=4 \ 19 | --rdzv_id=151152 \ 20 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ 23 | python train.py" 24 | -------------------------------------------------------------------------------- /env-files/torch/createEnvVega.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------------------------------------------------- 5 | # Part of the interTwin Project: https://www.intertwin.eu/ 6 | # 7 | # Created by: Matteo Bunino 8 | # 9 | # Credit: 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ ! -f "env-files/torch/generic_torch.sh" ]; then 14 | echo "ERROR: env-files/torch/generic_torch.sh not found!" 
15 | exit 1 16 | fi 17 | 18 | # Load modules 19 | # NOTE: REFLECT THEM IN THE MAIN README! 20 | ml --force purge 21 | ml CMake/3.29.3-GCCcore-13.3.0 22 | ml mpi4py/3.1.5 23 | ml OpenMPI/4.1.6-GCC-13.2.0 24 | ml cuDNN/8.9.7.29-CUDA-12.3.0 25 | ml CUDA/12.6.0 26 | ml NCCL/2.22.3-GCCcore-13.3.0-CUDA-12.6.0 27 | ml Python/3.12.3-GCCcore-13.3.0 28 | 29 | # You should have CUDA 12.6 now 30 | 31 | 32 | # Create and install torch env 33 | export ENV_NAME=".venv-pytorch" 34 | export PIP_INDEX_TORCH_CUDA="https://download.pytorch.org/whl/cu126" 35 | bash env-files/torch/generic_torch.sh 36 | -------------------------------------------------------------------------------- /src/itwinai/tensorflow/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | import json 12 | 13 | import keras 14 | 15 | 16 | def model_to_json(model: keras.Model, filepath: str): 17 | """Serialize Keras model to JSON file. 18 | 19 | Args: 20 | model (keras.Model): Keras model. 21 | filepath (str): JSON file path. 22 | """ 23 | with open(filepath, "w") as f: 24 | json.dump(model.to_json(), f) 25 | 26 | 27 | def model_from_json(filepath: str) -> keras.Model: 28 | """Deserialize Keras model from JSON file. 29 | 30 | Args: 31 | filepath (str): JSON file path. 32 | 33 | Returns: 34 | keras.Model: loaded Keras model. 35 | """ 36 | with open(filepath, "r") as f: 37 | config = json.load(f) 38 | return keras.models.model_from_json(config) 39 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python /opt/setup-rucio-jupyterlab/configure.py 4 | 5 | # Creation of the rucio.cfg file 6 | mkdir -p /certs /tmp; 7 | echo -n $RUCIO_ACCESS_TOKEN > /tmp/rucio_oauth.token; 8 | # mkdir -p /opt/rucio/etc; 9 | # echo "[client]" >> /opt/rucio/etc/rucio.cfg; 10 | # echo "rucio_host = https://rucio-intertwin-testbed.desy.de" >> /opt/rucio/etc/rucio.cfg; 11 | # echo "auth_host = https://rucio-intertwin-testbed-auth.desy.de" >> /opt/rucio/etc/rucio.cfg; 12 | # #echo "ca_cert = /certs/rucio_ca.pem" >> /opt/rucio/etc/rucio.cfg; 13 | # echo "ca_cert = /opt/conda/lib/python3.9/site-packages/certifi/cacert.pem" >> /opt/rucio/etc/rucio.cfg; 14 | # echo "account = $JUPYTERHUB_USER" >> /opt/rucio/etc/rucio.cfg; 15 | # echo "auth_type = oidc" >> /opt/rucio/etc/rucio.cfg; 16 | # echo "oidc_audience = rucio-testbed" >> /opt/rucio/etc/rucio.cfg; 17 | # echo "oidc_polling = true" >> /opt/rucio/etc/rucio.cfg; 18 | # echo "oidc_scope = openid profile offline_access eduperson_entitlement" >> /opt/rucio/etc/rucio.cfg; 19 | # echo "auth_token_file_path = /tmp/rucio_oauth.token" >> /opt/rucio/etc/rucio.cfg; 20 | 21 | exec "$@" -------------------------------------------------------------------------------- /ci/src/main/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | 
# - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | def get_codename(release_info: str) -> str: 12 | """ 13 | Extracts the codename (VERSION_CODENAME or os_version) from release information. 14 | 15 | Args: 16 | release_info (str): The string containing the output of /etc/*-release. 17 | 18 | Returns: 19 | str: The extracted codename (e.g., "jammy" or "bookworm"). 20 | """ 21 | # Create a dictionary from the release info 22 | release_dict = {} 23 | for line in release_info.splitlines(): 24 | if "=" in line: 25 | key, value = line.split("=", 1) 26 | release_dict[key.strip()] = value.strip().strip('"') 27 | 28 | # Attempt to extract the codename 29 | return release_dict.get("VERSION_CODENAME", release_dict.get("os_version", "Unknown")) 30 | -------------------------------------------------------------------------------- /use-cases/3dgan/downsample_h5py_file.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Downsample H5 files to a more manageable size.""" 11 | 12 | import h5py 13 | 14 | IN_FILENAME = "large_file.h5" 15 | OUT_FILENAME = "sample.h5" 16 | MAXITEMS = 100 17 | 18 | with h5py.File(IN_FILENAME, "r") as input_file: 19 | with h5py.File(OUT_FILENAME, "w") as outfile: 20 | for key in input_file.keys(): 21 | print(input_file[key]) 22 | shape = list(input_file[key].shape) 23 | shape[0] = MAXITEMS 24 | outfile.create_dataset_like(name=key, other=input_file[key], shape=tuple(shape)) 25 | print(outfile[key]) 26 | outfile[key][...] 
= input_file[key][:MAXITEMS] 27 | 28 | print("verify similarities") 29 | print(input_file["energy"][:10]) 30 | print(outfile["energy"][:10]) 31 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/model.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | 11 | import torch.nn.functional as F 12 | from torch import nn 13 | 14 | 15 | class Net(nn.Module): 16 | def __init__(self): 17 | super(Net, self).__init__() 18 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 19 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 20 | self.conv2_drop = nn.Dropout2d() 21 | self.fc1 = nn.Linear(320, 50) 22 | self.fc2 = nn.Linear(50, 10) 23 | 24 | def forward(self, x): 25 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 26 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 27 | x = x.view(-1, 320) 28 | x = F.relu(self.fc1(x)) 29 | x = F.dropout(x, training=self.training) 30 | x = self.fc2(x) 31 | return F.log_softmax(x, dim=1)  # normalize over the class dimension; dim=0 would normalize across the batch 32 | -------------------------------------------------------------------------------- /src/itwinai/slurm/sample_slurm_config.yaml: -------------------------------------------------------------------------------- 1 | job_name: my_slurm_job 2 | 3 | account: intertwin 4 | partition: develbooster 5 | 6 | # HH:MM:SS 7 | time: 00:11:11 8 | 9 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 10 | # if you override the dist_strat in the CLI, then these will already have evaluated 11 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 12 | # the config and avoid overriding it in the CLI. 13 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 14 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 15 | 16 | num_nodes: 1 17 | gpus_per_node: 4 18 | cpus_per_task: 16 19 | memory: 16G 20 | 21 | # The distributed strategy can be "ddp", "deepspeed" or "horovod" 22 | dist_strat: ddp 23 | python_venv: .venv 24 | exp_name: my_experiment 25 | run_name: my_run 26 | exclusive: False 27 | 28 | # Make sure the below strategy matches the one above 29 | training_cmd: | 30 | $(which itwinai) exec-pipeline \ 31 | --config config.yaml \ 32 | --pipe-key rnn_training_pipeline \ 33 | strategy={dist_strat} \ 34 | experiment_name={experiment_name} \ 35 | run_name={run_name} 36 | -------------------------------------------------------------------------------- /use-cases/README.md: -------------------------------------------------------------------------------- 1 | # interTwin use cases integrated into itwinai 2 | 3 | This folder shows how `itwinai` can be used to support scientific use cases. Each use case folder contains: 4 | 5 | - A YAML configuration file describing the ML workflows for that use case. 6 | - A SLURM job script, used to execute the ML workflows on a SLURM-based cluster. 7 | - `requirements.txt`: (optional) use case-specific requirements, which can be installed with: 8 | 9 | ```bash 10 | cd use/case/folder 11 | # After activating the correct environment...
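# For example (illustrative, assuming the default torch venv at the repo root):
# source ../../.venv-pytorch/bin/activate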
12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## How to run a use case 16 | 17 | First, create the use case's Python environment (i.e., PyTorch or TensorFlow) 18 | as described [in the main README](../README.md#environment-setup), and activate it. 19 | Then, install use case-specific dependencies, if any: 20 | 21 | ```bash 22 | pip install -r /use/case/path/requirements.txt 23 | ``` 24 | 25 | Alternatively, you can use the use case Docker image, if available. 26 | 27 | Then, go to the use case's directory: 28 | 29 | ```bash 30 | cd use/case/path 31 | ``` 32 | 33 | From there you can run the use case following the instructions provided in the use case's folder. 34 | -------------------------------------------------------------------------------- /tests/torch/test_config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import pytest 11 | from pydantic import ValidationError 12 | 13 | from itwinai.torch.config import TrainingConfiguration 14 | 15 | 16 | def test_values_parsing(): 17 | """Check dynamic override and creation of new entries.""" 18 | cfg = TrainingConfiguration(batch_size="11", param_abc="11", param_xyz=1.1) 19 | assert cfg.batch_size == 11 20 | assert cfg.param_abc == "11" 21 | assert cfg.param_xyz == 1.1 22 | assert isinstance(cfg.pin_gpu_memory, bool) 23 | 24 | # Check dict-like getitem 25 | assert cfg["batch_size"] == 11 26 | 27 | 28 | def test_illegal_override(): 29 | """Test that illegal type override fails.""" 30 | with pytest.raises(ValidationError) as exc_info: 31 | TrainingConfiguration(batch_size="hello") 32 | assert "batch_size" in str(exc_info.value) 33 | -------------------------------------------------------------------------------- /src/itwinai/slurm/slurm_constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Jarl Sondre Sæther 5 | # 6 | # Credit: 7 | # - Jarl Sondre Sæther - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | DEFAULT_SLURM_LOG_DIR = "slurm-job-logs" 12 | DEFAULT_SLURM_SAVE_DIR = "slurm-scripts" 13 | DEFAULT_PY_SPY_DIR = "py-spy-output" 14 | SLURM_TEMPLATE = r"""#!/bin/bash 15 | 16 | # Job configuration 17 | #SBATCH --job-name={job_name} 18 | #SBATCH --account={account} 19 | #SBATCH --partition={partition} 20 | #SBATCH --time={time} 21 | 22 | #SBATCH --output={std_out} 23 | #SBATCH --error={err_out} 24 | 25 | # Resource allocation 26 | #SBATCH --nodes={num_nodes} 27 | #SBATCH --ntasks-per-node={num_tasks_per_node} 28 | #SBATCH --cpus-per-task={cpus_per_task} 29 | #SBATCH --gpus-per-node={gpus_per_node} 30 | #SBATCH --gres=gpu:{gpus_per_node} 31 | #SBATCH --mem={memory} 32 | {exclusive_line} 33 | 34 | {pre_exec_command} 35 | 36 | {exec_command}""" 37 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/run_docker.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | CMD="itwinai exec-pipeline" 13 | 14 | # Run command in the itwinai torch Docker container 15 | if [ -z "$1" ]; then 16 | # CPU only execution 17 | docker run -it --rm --name mnist-training --user $UID:$GID \ 18 | --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 19 | -v "$PWD":/use-case ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 \ 20 | /bin/bash -c "cd /use-case && $CMD" 21 | elif [ "$1" == "gpu" ]; then 22 | # With GPU support: --gpus all 23 | docker run -it --rm --name mnist-training --user $UID:$GID \ 24 | --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 25 | -v "$PWD":/use-case ghcr.io/intertwin-eu/itwinai:0.2.2-torch-2.1 \ 26 | /bin/bash -c "cd /use-case && $CMD" 27 | fi 28 | -------------------------------------------------------------------------------- /tests/run_on_jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Run tests on JSC environment 13 | # Set TORCH_ENV and TF_ENV variables below to use different 14 | # virtual environment names. 15 | 16 | ml --force purge 17 | ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 18 | ml Python CMake HDF5 PnetCDF libaio 19 | 20 | export TORCH_ENV="envAI_hdfml" 21 | export TF_ENV="envAItf_hdfml" 22 | 23 | if [ ! -d "$TORCH_ENV" ]; then 24 | echo "$TORCH_ENV not found!" 25 | exit 1 26 | fi 27 | if [ ! -d "$TF_ENV" ]; then 28 | echo "$TF_ENV not found!" 29 | exit 1 30 | fi 31 | 32 | # Avoid downloading datasets from Gdrive 33 | export CERN_DATASET="/p/project1/intertwin/smalldata/3dgan-sample" 34 | export CMCCC_DATASET="/p/project1/intertwin/smalldata/cmcc" 35 | export MNIST_DATASET="/p/project1/intertwin/smalldata/mnist" 36 | 37 | $TORCH_ENV/bin/pytest -v tests/ -m "not slurm" -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import os 11 | from pathlib import Path 12 | 13 | import pytest 14 | 15 | 16 | @pytest.fixture 17 | def torch_env() -> str: 18 | """If TORCH_ENV env variable is defined, it overrides the default 19 | torch virtual environment name. Otherwise, fall back 20 | to './.venv-pytorch'. 21 | 22 | Returns absolute path to torch virtual environment. 
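For example (hypothetical path): exporting TORCH_ENV='/opt/my-venv' before
running pytest makes this fixture return the resolved absolute path of
'/opt/my-venv'.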
23 | """ 24 | env_path = Path(os.environ.get("TORCH_ENV", "./.venv-pytorch")) 25 | return str(env_path.resolve()) 26 | 27 | 28 | @pytest.fixture 29 | def tf_env() -> str: 30 | """If TF_ENV env variable is defined, it overrides the default 31 | torch virtual environment name. Otherwise, fall back 32 | to './.venv-tf'. 33 | 34 | Returns absolute path to torch virtual environment. 35 | """ 36 | env_path = Path(os.environ.get("TF_ENV", "./.venv-tf")) 37 | return str(env_path.resolve()) 38 | -------------------------------------------------------------------------------- /ci/pyproject.toml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | [project] 11 | name = "main" 12 | version = "0.1.0" 13 | maintainers = [{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" }] 14 | authors = [{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" }] 15 | requires-python = ">=3.12" 16 | dependencies = [ 17 | "dagger-io", 18 | "pyyaml>=6.0.2", 19 | "ruff>=0.7.3", 20 | ] 21 | 22 | [tool.uv.sources] 23 | dagger-io = { path = "sdk", editable = true } 24 | 25 | [build-system] 26 | requires = ["hatchling==1.25.0"] 27 | build-backend = "hatchling.build" 28 | 29 | # Ruff configuration: https://docs.astral.sh/ruff/configuration/ 30 | [tool.ruff] 31 | line-length = 95 32 | 33 | [tool.ruff.lint] 34 | select = ["E", "F", "I", "W"] 35 | ignore = ["E203"] 36 | fixable = ["ALL"] 37 | 38 | [tool.ruff.format] 39 | quote-style = "double" 40 | indent-style = "space" 41 | skip-magic-trailing-comma = false 42 | line-ending = "auto" 43 | -------------------------------------------------------------------------------- /.github/workflows/sqaaas.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright contributors to the Software Quality Assurance as a Service (SQAaaS) project. 
2 | # 3 | # SPDX-License-Identifier: GPL-3.0-only 4 | --- 5 | name: SQAaaS 6 | 7 | on: 8 | push: 9 | branches: [main] 10 | # pull_request: 11 | # branches: [main, dev] 12 | 13 | jobs: 14 | sqaaas_job: 15 | runs-on: ubuntu-latest 16 | name: Job that triggers SQAaaS platform 17 | steps: 18 | - name: Step definition for validating the workflow 19 | uses: eosc-synergy/sqaaas-step-action@v1 20 | with: 21 | name: workflow_validation_step 22 | tool: commands 23 | 24 | # Skipping tensorflow tests: make tensorflow-env-cpu 25 | commands: | 26 | make torch-env-cpu 27 | .venv-pytorch/bin/pytest -v ./tests/ --disable-warnings -n logical --dist loadfile -m "not hpc and not memory_heavy and not tensorflow" 28 | container: eoscsynergy/sqaaas-micromamba:1.5.3-1-rc.8 29 | - name: Print out payload 30 | run: cat workflow_validation_step.json 31 | - name: SQAaaS assessment with unit testing (QC.Uni) step 32 | uses: eosc-synergy/sqaaas-assessment-action@v2 33 | with: 34 | qc_uni_steps: workflow_validation_step 35 | -------------------------------------------------------------------------------- /use-cases/cyclones/src/transform.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def coo_rot180(data): 5 | X, y = data 6 | patch_size = X.shape[0] 7 | X = tf.image.rot90(X, k=2) 8 | y1 = [-1., -1.] 9 | if y[0] != -1: 10 | y1 = [-y[0] + patch_size - 1, -y[1] + patch_size - 1] 11 | return (X, y1) 12 | 13 | 14 | def coo_left_right(data): 15 | X, y = data 16 | patch_size = X.shape[0] 17 | X = tf.image.flip_left_right(X) 18 | y1 = [-1., -1.] 19 | if y[0] != -1: 20 | y1 = [y[0], - y[1] + patch_size - 1] 21 | return (X, y1) 22 | 23 | 24 | def coo_up_down(data): 25 | X, y = data 26 | patch_size = X.shape[0] 27 | X = tf.image.flip_up_down(X) 28 | y1 = [-1., -1.] 
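# A valid cyclone label (y != [-1, -1]) must be remapped as well: an up-down flip
# mirrors the row coordinate and leaves the column unchanged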
29 | if y[0] != -1: 30 | y1 = [- y[0] + patch_size - 1, y[1]] 31 | return (X, y1) 32 | 33 | 34 | def msk_rot180(data): 35 | X, Y = data 36 | X = tf.image.rot90(X, k=2) 37 | Y = tf.image.rot90(Y, k=2) 38 | return (X, Y) 39 | 40 | 41 | def msk_left_right(data): 42 | X, Y = data 43 | X = tf.image.flip_left_right(X) 44 | Y = tf.image.flip_left_right(Y) 45 | return (X, Y) 46 | 47 | 48 | def msk_up_down(data): 49 | X, Y = data 50 | X = tf.image.flip_up_down(X) 51 | Y = tf.image.flip_up_down(Y) 52 | return (X, Y) 53 | -------------------------------------------------------------------------------- /use-cases/xtclim/src/utils.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import numpy as np 3 | import torchvision.transforms as transforms 4 | import matplotlib.pyplot as plt 5 | from torchvision.utils import save_image 6 | 7 | to_pil_image = transforms.ToPILImage() 8 | 9 | def image_to_vid(images): 10 | # save the images generated along the training and build a GIF video from them 11 | imgs = [np.array(to_pil_image(img)) for img in images] 12 | imageio.mimsave('outputs/generated_images.gif', imgs) 13 | 14 | def save_reconstructed_images(recon_images, epoch, season=''): 15 | # save all reconstructed images at each epoch 16 | save_image(recon_images.cpu(), f"outputs/image_record/{season}output{epoch}.jpg") 17 | 18 | def save_ex(recon_ex, epoch, season=''): 19 | # save an example image at a given epoch 20 | save_image(recon_ex.cpu(), f"outputs/image_record/{season}ex{epoch}.jpg") 21 | 22 | def save_loss_plot(train_loss, valid_loss, season=''): 23 | # save the plot of the evolution of both the training and validation losses 24 | plt.figure(figsize=(10, 7)) 25 | plt.plot(train_loss, color='orange', label='train loss') 26 | plt.plot(valid_loss, color='red', label='validation loss') 27 | plt.xlabel('Epochs') 28 | plt.ylabel('Loss') 29 | plt.legend() 30 | plt.savefig(f'outputs/{season}loss.jpg') 31 | plt.show() 32 | -------------------------------------------------------------------------------- /use-cases/lattice-qcd/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from normflow import Model, Fitter 3 | from normflow.nn import DistConvertor_ 4 | from normflow.action import ScalarPhi4Action 5 | from normflow.prior import NormalPrior 6 | 7 | def make_model(): 8 | net_ = DistConvertor_(10, symmetric=True) 9 | prior = NormalPrior(shape=(1,)) 10 | action = ScalarPhi4Action(kappa=0, m_sq=-1.2, lambd=0.5) 11 | 12 | return Model(net_=net_, prior=prior, action=action) 13 | 14 | def fit_func(model, n_epochs=100, strategy='ddp'): 15 | """Training function to fit model.""" 16 | 17 | config = { 18 | "optim_lr": 0.001, 19 | "weight_decay": 0.01, 20 | "ckpt_disp": False, 21 | "batch_size": 128, 22 | "save_every": "None", 23 | "optimizer_class": "torch.optim.AdamW", 24 | "scheduler": "None", 25 | "loss_fn": "None", 26 | "print_stride": 10, 27 | "print_batch_size": 1024, 28 | "snapshot_path": None, 29 | "epochs_run": 0 30 | } 31 | # Initialize the Fitter and execute the training 32 | fitter = Fitter(model=model, epochs=n_epochs, config=config, strategy=strategy) 33 | fitter.execute() 34 | 35 | def main(): 36 | model = make_model() 37 | fit_func(model) 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-kubeflow-1/Dockerfile: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | FROM python:3.11-slim-bullseye 11 | 12 | WORKDIR /app 13 | 14 | RUN apt-get update && apt-get install -y \ 15 | git \ 16 | && apt-get clean -y && rm -rf /var/lib/apt/lists/* 17 | 18 | COPY pyproject.toml pyproject.toml 19 | COPY src src 20 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel \ 21 | && pip install --no-cache-dir ".[torch]" --extra-index-url https://download.pytorch.org/whl/cpu 22 | 23 | COPY tutorials/distributed-ml/torch-k8s/train-cpu.py train-cpu.py 24 | 25 | LABEL org.opencontainers.image.authors="Matteo Bunino - matteo.bunino@cern.ch" 26 | LABEL org.opencontainers.image.url="https://github.com/interTwin-eu/itwinai" 27 | LABEL org.opencontainers.image.documentation="https://itwinai.readthedocs.io/" 28 | LABEL org.opencontainers.image.source="https://github.com/interTwin-eu/itwinai" 29 | LABEL org.opencontainers.image.vendor="CERN - European Organization for Nuclear Research" -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/startscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=PrototypeTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:30:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=4 17 | #SBATCH --gpus-per-node=4 18 | 19 | #SBATCH --exclusive 20 | 21 | # gres options have to be disabled for deepv 22 | #SBATCH --gres=gpu:4 23 | 24 | # load modules 25 | ml --force purge 26 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 27 | 28 | # shellcheck source=/dev/null 29 | source ~/.bashrc 30 | 31 | # Using legacy (2.16) version of Keras 32 | # Latest version with TF (2.16) installs Keras 3.3 33 | # which returns an error for multi-node execution 34 | export TF_USE_LEGACY_KERAS=1 35 | 36 | # ON LOGIN NODE download datasets: 37 | # ../../../.venv-tf/bin/itwinai exec-pipeline --config_name pipeline +pipe_key=pipeline +pipe_steps=[0] 38 | source ../../../envAItf_hdfml/bin/activate 39 | srun itwinai exec-pipeline --config-name pipeline +pipe_key=pipeline verbose=2 40 | -------------------------------------------------------------------------------- /docs/how-it-works/training/training.rst: -------------------------------------------------------------------------------- 1 | Training a neural network 2 | =========================== 3 | 4 | **Author(s)**: Matteo Bunino (CERN) 5 | 6 | itwinai aims at simplifying the way you train deep learning models, helping you to scale training to HPC resources, 7 | while integrating popular logging frameworks, such as MLflow, Weights&Biases, and TensorBoard. 8 | 9 | itwinai TorchTrainer 10 | ------------------------- 11 | 12 | Below, you can find some tutorials that will help you get familiar with the itwinai **TorchTrainer**: 13 | 14 | .. raw:: html 15 | 16 | 17 | 18 | | 19 | 20 | .. 
raw:: html 21 | 22 | 23 | 24 | | 25 | 26 | .. include:: explain_ddp.rst 27 | -------------------------------------------------------------------------------- /env-files/tensorflow/generic_tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ -z "$ENV_NAME" ]; then 14 | ENV_NAME=".venv-tf" 15 | fi 16 | 17 | work_dir=$PWD 18 | 19 | # Create the python venv if it doesn't already exist 20 | if [ -d "${work_dir}/$ENV_NAME" ];then 21 | echo "env $ENV_NAME already exists" 22 | else 23 | python3 -m venv $ENV_NAME 24 | echo "$ENV_NAME environment is created in ${work_dir}" 25 | fi 26 | 27 | source $ENV_NAME/bin/activate 28 | 29 | if [ -z "$NO_CUDA" ]; then 30 | TF_EXTRA="tf-cuda" # NO_CUDA is unset: install with CUDA (GPU) support 31 | else 32 | TF_EXTRA="tf" # NO_CUDA is set: CPU-only installation 33 | fi 34 | pip install --no-cache-dir -e ".[$TF_EXTRA,dev]" 35 | 36 | # Install Prov4ML 37 | if [[ "$(uname)" == "Darwin" ]]; then 38 | pip install --no-cache-dir "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2" 39 | else 40 | # Assuming Nvidia GPUs are available 41 | pip install --no-cache-dir "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2" 42 | fi -------------------------------------------------------------------------------- /use-cases/eurac/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this in the runall or scaling-test mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | 6 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 7 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 8 | 9 | account: intertwin 10 | time: 02:00:00 11 | partition: develbooster 12 | 13 | std_out: slurm_job_logs/${dist_strat}.out 14 | err_out: slurm_job_logs/${dist_strat}.err 15 | job_name: eurac-${dist_strat}-job 16 | 17 | num_nodes: 2 18 | num_tasks_per_node: 1 19 | gpus_per_node: 4 20 | cpus_per_task: 16 21 | 22 | python_venv: ../../.venv 23 | pipe_key: training_pipeline 24 | config_path: . 25 | config_name: config 26 | 27 | # The different number of nodes to use for the scalability testing 28 | scalability_nodes: "1, 2, 4, 8" 29 | 30 | # Variables in the curly brackets, "{}", will be overridden by the builder 31 | training_cmd: "$(which itwinai) exec-pipeline \ 32 | --config-path {config_path} \ 33 | --config-name {config_name} \ 34 | +pipe_key={pipe_key} \ 35 | strategy={dist_strat}" 36 | 37 | # WARNING: If you, in the CLI, override any of the variables specified in the curly 38 | # brackets above, there will likely be a mismatch in the builder, causing potential 39 | # bugs.
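# For illustration only (hypothetical rendering, not part of the original config):
# with the default values above, the builder would expand the training command
# roughly into
#
#   itwinai exec-pipeline --config-path . --config-name config \
#     +pipe_key=training_pipeline strategy=ddp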
40 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | name: Upload Python Package to PyPI when a Release is Created 11 | 12 | on: 13 | release: 14 | types: [created] 15 | 16 | jobs: 17 | pypi-publish: 18 | name: Publish release to PyPI 19 | runs-on: ubuntu-latest 20 | environment: 21 | name: pypi 22 | url: https://pypi.org/p/itwinai 23 | permissions: 24 | id-token: write 25 | steps: 26 | - uses: actions/checkout@v6 27 | - name: Set up Python 28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: "3.x" 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -q build 35 | # pip install setuptools wheel 36 | - name: Build package 37 | run: python -m build 38 | # python setup.py sdist bdist_wheel # Could also be python -m build 39 | - name: Publish package distributions to PyPI 40 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | submodules: 9 | include: 10 | - tutorials/plugins 11 | recursive: true 12 | 13 | # Set the OS, Python version and other tools you might need 14 | build: 15 | os: ubuntu-22.04 16 | tools: 17 | python: "3.10" 18 | # You can also specify other tool versions: 19 | # nodejs: "19" 20 | # rust: "1.64" 21 | # golang: "1.19" 22 | apt_packages: 23 | - gcc-11 24 | - g++-11 25 | # - cmake 26 | - pandoc 27 | 28 | jobs: 29 | pre_build: 30 | - typer itwinai.cli utils docs --output docs/api/cli.md 31 | - python docs/convert_admonitions.py --dir docs/ 32 | 33 | # Build documentation in the "docs/" directory with Sphinx 34 | sphinx: 35 | configuration: docs/conf.py 36 | fail_on_warning: true # Equivalent to -W in the Makefile 37 | 38 | # Optionally build your docs in additional formats such as PDF and ePub 39 | # formats: 40 | # - pdf 41 | # - epub 42 | 43 | # Optional but recommended, declare the Python requirements required 44 | # to build your documentation 45 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 46 | python: 47 | install: 48 | - requirements: docs/requirements.txt 49 | -------------------------------------------------------------------------------- /use-cases/cyclones/README.md: -------------------------------------------------------------------------------- 1 | # Tropical cyclone detection 2 | 3 | **Integration author(s)**: Matteo Bunino (CERN), Roman Machacek (CERN), Mario Ruettgers (JSC) 4 | 5 | The code is adapted from the CMCC use case's 6 | [repository](https://github.com/CMCC-Foundation/ml-tropical-cyclones-detection). 
7 | 8 | ## Setup env 9 | 10 | ```bash 11 | # After activating the environment 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## Dataset 16 | 17 | If the automatic download from Python does not work, try from the command line, from 18 | within the virtual environment: 19 | 20 | ```bash 21 | gdown https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD -O data/tmp_data/trainval --folder 22 | ``` 23 | 24 | For more info visit the [gdown](https://github.com/wkentaro/gdown) repository. 25 | 26 | ## Training 27 | 28 | Launch training: 29 | 30 | ```bash 31 | # # ONLY IF tensorflow>=2.16 32 | # export TF_USE_LEGACY_KERAS=1 33 | 34 | source ../../.venv-tf/bin/activate 35 | python train.py -p pipeline.yaml 36 | ``` 37 | 38 | On JSC, the dataset is pre-downloaded and you can use the following command: 39 | 40 | ```bash 41 | # # ONLY IF tensorflow>=2.16 42 | # export TF_USE_LEGACY_KERAS=1 43 | 44 | source ../../envAItf_hdfml/bin/activate 45 | python train.py -p pipeline.yaml --data_path /p/project/intertwin/smalldata/cmcc 46 | 47 | # Launch a job with SLURM 48 | sbatch startscript.sh 49 | ``` 50 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-scaling-test-jube/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking tutorial using JUBE 2 | 3 | Benchmarking of itwinai can also be performed with the JUBE Benchmarking Environment from JSC. 4 | The JUBE benchmarking tool is already set up in the environment files provided under `env-files`. 5 | 6 | ## Source the environment 7 | 8 | Find the location of your environment file along with the module load commands, such as: 9 | 10 | ```bash 11 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 12 | source envAI_hdfml/bin/activate 13 | ``` 14 | 15 | ## Run benchmark 16 | 17 | The benchmarks are defined in the `general_jobsys.xml` file. 18 | One can specify the configurations in terms of parameters such as the number of nodes. 19 | The benchmark can simply be launched with the command: 20 | 21 | ```bash 22 | jube run general_jobsys.xml 23 | ``` 24 | 25 | ## Monitor status of benchmark run 26 | 27 | The status of the run can be monitored with: 28 | 29 | ```bash 30 | jube continue bench_run --id last 31 | ``` 32 | 33 | ## Check results of the benchmark run 34 | 35 | The results can be viewed with: 36 | 37 | ```bash 38 | jube result -a bench_run --id last 39 | ``` 40 | 41 | This will create a `result-csv.dat` file in the `results` folder. 42 | 43 | The scaling and efficiency plots can be generated with the `bench_plot.ipynb` file 44 | which takes the `result-csv.dat` file as input.
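For a quick look at the results outside of the notebook, the table can also be loaded directly. A minimal sketch (assuming `result-csv.dat` is a plain comma-separated file and that pandas is installed in the active environment):

```python
import pandas as pd

# Load the result table produced by `jube result -a bench_run --id last`
df = pd.read_csv("results/result-csv.dat")

# Inspect the recorded parameters and metrics, e.g. number of nodes vs. epoch time
print(df.head())
```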
45 | -------------------------------------------------------------------------------- /use-cases/3dgan/run-provenance-experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | rm -rf slurm_logs mllogs 13 | mkdir slurm_logs 14 | 15 | # ========== FOR JSC ========== 16 | # ml --force purge 17 | # ml Stages/2024 GCC CUDA/12 cuDNN Python 18 | 19 | # SLURM_SCRIPT="slurm.jsc.sh" 20 | # source ../../envAI_hdfml/bin/activate 21 | # ========== FOR JSC ========== 22 | 23 | # ========== FOR Vega ========== 24 | SLURM_SCRIPT="slurm.vega.sh" 25 | source ../../.venv-pytorch/bin/activate 26 | # ========== FOR Vega ========== 27 | 28 | # Launch experiments 29 | 30 | # 1 worker: no SLURM needed 31 | itwinai exec-pipeline 1> slurm_logs/1_worker.out 2> slurm_logs/1_worker.err 32 | 33 | # 4, 8, 16... workers 34 | sbatch --wait --nodes=1 --output=slurm_logs/4_worker.out --error=slurm_logs/4_worker.err $SLURM_SCRIPT 35 | sbatch --wait --nodes=2 --output=slurm_logs/8_worker.out --error=slurm_logs/8_worker.err $SLURM_SCRIPT 36 | sbatch --wait --nodes=4 --output=slurm_logs/16_worker.out --error=slurm_logs/16_worker.err $SLURM_SCRIPT 37 | sbatch --wait --nodes=8 --output=slurm_logs/32_worker.out --error=slurm_logs/32_worker.err $SLURM_SCRIPT -------------------------------------------------------------------------------- /ci/src/main/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Dagger module for itwinai CI. 11 | 12 | This module provides logic to build containers, run tests with pytest, and more. 13 | 14 | Since itwinai is designed for HPC deployment, the containers need to be tested on relevant 15 | computing environments with hardware (e.g., GPUs) and software (e.g., CUDA) not accessible 16 | in standard GitHub Actions VMs. By deploying interLink within the CI pipeline, some tests 17 | can be offloaded to run on HPC. 18 | 19 | 20 | 21 | Additionally, since HPC systems prefer Singularity/Apptainer images over Docker, this 22 | module enables the conversion and publication of Docker containers as SIF files. 23 | 24 | Two CI pipelines are provided: a development pipeline, which is simpler and does not 25 | run tests on HPC, and a release pipeline, where containers undergo thorough testing on 26 | HPC, are converted to Singularity, and are pushed to both Docker and Singularity 27 | container registries.
28 | """ 29 | 30 | from .main import Itwinai as Itwinai 31 | 32 | __all__ = ["Itwinai"] 33 | -------------------------------------------------------------------------------- /tests/use-cases/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # - Anna Lappe - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | import os 12 | import subprocess 13 | from typing import Callable 14 | 15 | import pytest 16 | 17 | FNAMES = [ 18 | "pipeline.yaml", 19 | "startscript", 20 | ] 21 | 22 | 23 | @pytest.fixture 24 | def check_folder_structure() -> Callable: 25 | """Verify that the use case folder complies with some predefined structure.""" 26 | 27 | def _check_structure(root: str): 28 | for fname in FNAMES: 29 | fpath = os.path.join(root, fname) 30 | assert os.path.isfile(fpath), f"'{fname}' is missing in '{fpath}'" 31 | 32 | return _check_structure 33 | 34 | 35 | @pytest.fixture 36 | def install_requirements() -> Callable: 37 | """Install requirements.txt, if present in root folder.""" 38 | 39 | def _install_reqs(root: str, env_prefix: str): 40 | req_path = os.path.join(root, "requirements.txt") 41 | if os.path.isfile(req_path): 42 | cmd = f"{env_prefix}/bin/pip install --no-cache-dir -r {req_path}" 43 | subprocess.run(cmd.split(), check=True) 44 | 45 | return _install_reqs 46 | -------------------------------------------------------------------------------- /docs/getting-started/plugins-list.rst: -------------------------------------------------------------------------------- 1 | Current List of itwinai Plugins 2 | =============================== 3 | 4 | Below is a list of existing **itwinai plugins**, which correspond to scientific use cases that have been integrated into the itwinai framework. 5 | 6 | Physics Sciences 7 | ---------------- 8 | 9 | - `3DGAN – Fast Simulation of Particles in Calorimeters `__ 10 | - `GlitchFlow – Noise Generation for Gravitational Waves Analysis at Virgo `__ 11 | - `Pulsar Detection (Radio Astronomy) `__ 12 | - `Machine Learned Particle Flow Reconstruction (MLPF) `__ 13 | - `Normflow - Normalizing flows as a generative model for lattice field theory `__ 14 | 15 | Environmental Sciences 16 | ----------------------- 17 | 18 | - `Hython – Hydrological Modelling for Drought Early Warnings `__ 19 | - `xtclim – ML-based extreme events detection and characterization (CERFACS) `__ 20 | - `AtmoRep – A Stochastic Model of Atmosphere Dynamics `__ 21 | 22 | Contribute Your Plugin 23 | ----------------------- 24 | 25 | If you are developing a plugin and would like it to be listed on this page, feel free to open a pull request to update it! 26 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-scaling-test/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this with in the runall or scaling mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 
3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | # 6 | num_nodes: 1 7 | num_tasks_per_node: 1 8 | gpus_per_node: 4 9 | cpus_per_task: 16 10 | 11 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 12 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 13 | itwinai_trainer: false 14 | 15 | account: intertwin 16 | time: 00:15:00 17 | partition: develbooster 18 | 19 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 20 | # if you override the dist_strat in the CLI, then these will already have been evaluated 21 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 22 | # the config and avoid overriding it in the CLI. 23 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 24 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 25 | job_name: tutorial-${dist_strat}-job 26 | 27 | # The different number of nodes to use for the scalability testing 28 | scalability_nodes: "1, 2, 4" 29 | 30 | python_venv: ../../../.venv 31 | 32 | # If you want to manually override the training command, uncomment the following: 33 | # training_cmd: | 34 | # $(which itwinai) exec-pipeline \ 35 | # --config-path ${config_file} \ 36 | # +pipe_key=${pipe_key} \ 37 | # strategy=${dist_strat} 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 12 | 13 | # Short Description of the issue 14 | 15 | 20 | 21 | ## Environment 22 | 23 | 26 | 27 | - Operating System: 28 | - Other related components versions: 29 | 30 | ## Steps to reproduce 31 | 32 | 36 | 37 | ## Logs, stacktrace, or other symptoms 38 | 39 | 44 | 45 | ```shell 46 | output 47 | ``` 48 | 49 | 51 | 52 | # Summary of proposed changes 53 | -------------------------------------------------------------------------------- /src/itwinai/tensorflow/models/mnist.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | 12 | # import tensorflow.keras as keras 13 | from typing import List 14 | 15 | import tensorflow as tf 16 | 17 | 18 | class MNIST_Model(tf.keras.Model): 19 | def __init__(self, input_shape: List[int] = (28, 28, 1), output_shape: int = 10): 20 | super().__init__() 21 | 22 | # LeNet5 (the constructor arguments are used instead of hardcoded shapes) 23 | self.model = tf.keras.Sequential( 24 | [ 25 | tf.keras.layers.Conv2D( 26 | filters=6, kernel_size=(3, 3), activation="relu", input_shape=input_shape 27 | ), 28 | tf.keras.layers.AveragePooling2D(2), 29 | tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation="relu"), 30 | tf.keras.layers.AveragePooling2D(2), 31 | tf.keras.layers.Flatten(), 32 | tf.keras.layers.Dense(units=120, activation="relu"), 33 | tf.keras.layers.Dense(units=84, activation="relu"), 34 | tf.keras.layers.Dense(units=output_shape), 35 | ] 36 | ) 37 | 38 | def call(self, inputs): 39 | return self.model(inputs) 40 | -------------------------------------------------------------------------------- /env-files/torch/jupyter/asyncssh_config.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # before it was !/opt/conda/bin/python 3 | # -*- coding: utf-8 -*- 4 | # 5 | # D. Ciangottini 6 | # 7 | import asyncio 8 | import os 9 | import re 10 | import sys 11 | from subprocess import Popen 12 | 13 | import asyncssh 14 | from jupyterhub.singleuser import main 15 | 16 | ssh_host = os.environ.get("JHUB_HOST") 17 | ssh_url_port = os.environ.get("SSH_PORT") 18 | username = os.environ.get("JUPYTERHUB_USER") 19 | token = os.environ.get("JUPYTERHUB_API_TOKEN") 20 | 21 | fwd_port = os.environ.get("FWD_PORT") 22 | 23 | 24 | async def run_client(): 25 | async with asyncssh.connect( 26 | host=ssh_host, 27 | port=int(ssh_url_port), 28 | username=username, 29 | password=token, 30 | known_hosts=None, 31 | ) as conn: 32 | conn.set_keepalive(interval=14.0, count_max=10) 33 | listener = await conn.forward_remote_port( 34 | "0.0.0.0", 35 | int(fwd_port), 36 | "0.0.0.0", 37 | int(fwd_port), 38 | ) 39 | await listener.wait_closed() 40 | 41 | 42 | if __name__ == "__main__": 43 | print("Connecting ssh...") 44 | loop = asyncio.get_event_loop() 45 | loop.create_task(run_client()) 46 | 47 | print("Configuring Rucio extension...") 48 | p = Popen(["/usr/local/bin/setup.sh"]) 49 | p.wait() # block until the setup script has finished 50 | 51 | print("Starting JLAB") 52 | sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) 53 | sys.exit(main()) 54 | 55 | -------------------------------------------------------------------------------- /env-files/torch/README.md: -------------------------------------------------------------------------------- 1 | # Container image definition files for PyTorch-based itwinai 2 | 3 | ## Singularity 4 | 5 | This example is for building the itwinai container for LUMI (AMD GPUs) locally (use `scp` to transfer the final image 6 | to LUMI). 7 | 8 | First navigate with `cd` to the base folder of itwinai. 9 | 10 | From there, download the singularity base image for pytorch with ROCm: 11 | 12 | ```bash 13 | singularity pull rocm-base-pytorch.sif REGISTRY_IMG 14 | ``` 15 | 16 | You can choose one of the following base images: 17 | 18 | - `oras://registry.egi.eu/dev.intertwin.eu/itwinai-dev:lumi-pytorch-rocm-6.1.3-python-3.12-pytorch-v2.4.1` 19 | - `oras://registry.cern.ch/itwinai/lumi:lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0-dockerhash-ef203c810cc9` 20 | 21 | Other base images can be found on LUMI at `/appl/local/containers/tested-containers` and 22 | `/appl/local/containers/sif-images`. See the 23 | [docs](https://lumi-supercomputer.github.io/LUMI-EasyBuild-docs/p/PyTorch/#getting-the-container-image) 24 | for more info. 25 | 26 | Then build the final container with: 27 | 28 | ```bash 29 | sudo singularity build --tmpdir /tmp itwinai-lumi-dev.sif env-files/torch/rocm.def 30 | ``` 31 | 32 | - `/tmp` is a location with enough storage space to support the build. 33 | 34 | Available itwinai images can be found at: 35 | 36 | - `oras://registry.egi.eu/dev.intertwin.eu/itwinai-dev:lumi-itwinai-pytorch-rocm-6.1.3-python-3.12-pytorch-v2.4.1` 37 | - `oras://registry.cern.ch/itwinai/lumi:itwinai0.3.3-lumi-pytorch-rocm-6.2.4-python-3.12-pytorch-v2.6.0-dockerhash-ef203c810cc9` 38 | -------------------------------------------------------------------------------- /docs/testing-with-pytest.md: -------------------------------------------------------------------------------- 1 | # Test with `pytest` 2 | 3 | Do this only if you are a developer wanting to test your code with pytest. 
4 | 5 | First, you need to create virtual environments both for torch and tensorflow, 6 | following the instructions above, depending on the system that you are using 7 | (e.g., JSC). 8 | 9 | To select the name of the torch and tf environments in which the tests will be 10 | executed, you can set the following environment variables. 11 | If these env variables are not set, the testing suite will assume that the 12 | PyTorch environment is under 13 | `.venv-pytorch` and the TensorFlow environment is under `.venv-tf`. 14 | 15 | ```bash 16 | export TORCH_ENV="my_torch_env" 17 | export TF_ENV="my_tf_env" 18 | ``` 19 | 20 | Functional tests (marked with `pytest.mark.functional`) will be executed under the 21 | `/tmp/pytest` location to guarantee isolation among tests. 22 | 23 | To run functional tests use: 24 | 25 | ```bash 26 | pytest -v tests/ -m "functional" 27 | ``` 28 | 29 | > [!NOTE] 30 | > Depending on the system that you are using, we implemented a tailored Makefile 31 | > target to run the test suite on it. Read these instructions until the end! 32 | 33 | We provide some Makefile targets to run the whole test suite including unit, integration, 34 | and functional tests. Choose the right target depending on the system that you are using: 35 | 36 | Makefile targets: 37 | 38 | - Juelich Supercomputer (JSC): `test-jsc` 39 | - In any other case: `test` 40 | 41 | For instance, to run the test suite on your laptop, use: 42 | 43 | ```bash 44 | make test 45 | ``` 46 | -------------------------------------------------------------------------------- /use-cases/virgo/slurm_config.yaml: -------------------------------------------------------------------------------- 1 | # If you use this in the runall or scaling-test mode, keep in mind that the strategies 2 | # will change, as well as the number of nodes. 3 | # 4 | # Default arguments can be seen in src/itwinai/slurm/utils.py 5 | 6 | mode: single # "single", "runall" or "scaling-test" - defaults to "single" 7 | dist_strat: ddp # "ddp", "deepspeed" or "horovod" 8 | 9 | account: intertwin 10 | time: 00:30:00 11 | partition: develbooster 12 | 13 | num_nodes: 1 14 | num_tasks_per_node: 1 15 | gpus_per_node: 4 16 | cpus_per_task: 16 17 | 18 | # Keep in mind that these will be overwritten if "mode" is not "single", and that 19 | # if you override the dist_strat in the CLI, then these will already have been evaluated 20 | # and thus might not correspond. Thus, we suggest you only change the dist_strat in 21 | # the config and avoid overriding it in the CLI. 22 | std_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.out 23 | err_out: slurm_job_logs/${dist_strat}-${num_nodes}x${gpus_per_node}.err 24 | job_name: virgo-${dist_strat}-job 25 | ################################## 26 | 27 | python_venv: ../../.venv 28 | pipe_key: training_pipeline 29 | config_path: .
30 | config_name: config 31 | 32 | 33 | # The different number of nodes to use for the scalability testing 34 | scalability_nodes: "1, 2, 4" 35 | 36 | # The following manually overrides the training command built by default: 37 | training_cmd: "$(which itwinai) exec-pipeline \ 38 | --config-path {config_path} \ 39 | --config-name {config_name} \ 40 | +pipe_key={pipe_key} \ 41 | strategy={dist_strat}" 42 | -------------------------------------------------------------------------------- /env-files/torch/install-horovod-deepspeed-cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Jarl Sondre Sæther 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | set -e 14 | 15 | # DeepSpeed variables 16 | export DS_BUILD_CCL_COMM=1 17 | export DS_BUILD_UTILS=1 18 | export DS_BUILD_AIO=1 19 | export DS_BUILD_FUSED_ADAM=1 20 | export DS_BUILD_FUSED_LAMB=1 21 | export DS_BUILD_TRANSFORMER=1 22 | export DS_BUILD_STOCHASTIC_TRANSFORMER=1 23 | export DS_BUILD_TRANSFORMER_INFERENCE=1 24 | 25 | # Use --no-cache-dir to avoid caching packages in your $HOME, which may have small disk quota 26 | uv pip install --no-cache-dir --no-build-isolation "deepspeed==0.16.8" 27 | 28 | # Horovod variables 29 | export LDSHARED="$CC -shared" 30 | export CMAKE_CXX_STANDARD=17 31 | 32 | export HOROVOD_MPI_THREADS_DISABLE=1 33 | export HOROVOD_CPU_OPERATIONS=MPI 34 | 35 | export HOROVOD_GPU_ALLREDUCE=NCCL 36 | export HOROVOD_NCCL_LINK=SHARED 37 | export HOROVOD_NCCL_HOME=$EBROOTNCCL 38 | 39 | export HOROVOD_WITH_PYTORCH=1 40 | export HOROVOD_WITHOUT_TENSORFLOW=1 41 | export HOROVOD_WITHOUT_MXNET=1 42 | 43 | uv pip install --no-cache-dir --no-build-isolation git+https://github.com/horovod/horovod.git@3a31d93 44 | 45 | echo "Finished Horovod and DeepSpeed installation script!" 46 | -------------------------------------------------------------------------------- /docs/tutorials/distrib-ml/torch-tutorial-containers.rst: -------------------------------------------------------------------------------- 1 | itwinai and containers (Docker and Singularity) 2 | =================================================== 3 | 4 | In this tutorial you will learn how to use itwinai's container images to run your ML workflows 5 | without having to set up the Python environment by means of virtual environments. 6 | 7 | .. include:: ../../../tutorials/distributed-ml/torch-tutorial-containers/README.md 8 | :parser: myst_parser.sphinx_ 9 | 10 | 11 | Shell scripts 12 | -------------- 13 | 14 | run_docker.sh 15 | ++++++++++++++++ 16 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/run_docker.sh 17 | :language: bash 18 | 19 | slurm.sh 20 | ++++++++++++ 21 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/slurm.sh 22 | :language: bash 23 | 24 | 25 | runall.sh 26 | ++++++++++++++++ 27 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/runall.sh 28 | :language: bash 29 | 30 | 31 | Pipeline configuration 32 | ----------------------- 33 | 34 | config.yaml 35 | ++++++++++++ 36 | 37 | .. 
literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/config.yaml 38 | :language: yaml 39 | 40 | 41 | Python files 42 | ------------------ 43 | 44 | model.py 45 | ++++++++++++ 46 | 47 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/model.py 48 | :language: python 49 | 50 | dataloader.py 51 | +++++++++++++++ 52 | .. literalinclude:: ../../../tutorials/distributed-ml/torch-tutorial-containers/dataloader.py 53 | :language: python 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/use-cases/3dgan_doc.rst: -------------------------------------------------------------------------------- 1 | Fast particle detector simulation (CERN) 2 | ======================================== 3 | 4 | This use case trains a 3D Generative Adversarial Network (3DGAN) for 5 | generation of images of calorimeter depositions. It is based on the 6 | prototype `3DGAN `_ model 7 | developed at CERN and is implemented using the PyTorch Lightning framework. 8 | 9 | This section covers the CERN use case that utilizes the `torch-lightning` framework 10 | for training and evaluation. Below you can find instructions to execute the CERN use 11 | case and its integral scripts: 12 | 13 | Integration with itwinai 14 | ------------------------ 15 | 16 | .. include:: ../../use-cases/3dgan/README.md 17 | :parser: myst_parser.sphinx_ 18 | :start-line: 2 19 | 20 | 21 | 3DGAN plugin for itwinai 22 | ------------------------ 23 | 24 | The integration code of the 3DGAN model has been adapted to be distributed as an independent 25 | itwinai plugin called `itwinai-3dgan-plugin `_. 26 | 27 | 28 | Offloading jobs via interLink 29 | ----------------------------- 30 | 31 | The CERN use case also has an integration with `interLink `_. You can find 32 | the relevant files in the 33 | `interLink directory on GitHub `_. 34 | You can also look at the README for more information: 35 | 36 | 37 | .. 
include:: ../../use-cases/3dgan/interLink/README.md 38 | :parser: myst_parser.sphinx_ 39 | :start-line: 0 40 | -------------------------------------------------------------------------------- /use-cases/3dgan/slurm.jsc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SLURM jobscript for JSC systems 4 | 5 | # general configuration of the job 6 | #SBATCH --job-name=PrototypeTest 7 | #SBATCH --account=intertwin 8 | #SBATCH --mail-user= 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --output=job.out 11 | #SBATCH --error=job.err 12 | #SBATCH --time=00:30:00 13 | 14 | # configure node and process count on the CM 15 | #SBATCH --partition=batch 16 | #SBATCH --nodes=2 17 | #SBATCH --ntasks-per-node=1 18 | #SBATCH --cpus-per-task=4 19 | #SBATCH --gpus-per-node=4 20 | 21 | #SBATCH --exclusive 22 | 23 | # gres options have to be disabled for deepv 24 | #SBATCH --gres=gpu:4 25 | 26 | # load modules 27 | ml --force purge 28 | ml Stages/2024 GCC CUDA/12 cuDNN Python 29 | # ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 30 | # ml Python CMake HDF5 PnetCDF libaio mpi4py 31 | 32 | # shellcheck source=/dev/null 33 | source ~/.bashrc 34 | 35 | # Activate the environment 36 | source ../../envAI_hdfml/bin/activate 37 | 38 | GAN_DATASET="exp_data" #"/p/scratch/intertwin/datasets/cern/" 39 | 40 | # launch training 41 | TRAINING_CMD="$(which itwinai) exec-pipeline num_nodes=$SLURM_NNODES \ 42 | dataset_location=$GAN_DATASET " 43 | 44 | srun --cpu-bind=none --ntasks-per-node=1 \ 45 | bash -c "torchrun \ 46 | --log_dir='logs_torchrun' \ 47 | --nnodes=$SLURM_NNODES \ 48 | --nproc_per_node=$SLURM_GPUS_PER_NODE \ 49 | --rdzv_id=$SLURM_JOB_ID \ 50 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 51 | --rdzv_backend=c10d \ 52 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ 53 | $TRAINING_CMD " -------------------------------------------------------------------------------- /env-files/torch/generic_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Jarl Sondre Sæther - CERN 10 | # - Matteo Bunino - CERN 11 | # -------------------------------------------------------------------------------------- 12 | 13 | if [ -z "$ENV_NAME" ]; then 14 | ENV_NAME=".venv-pytorch" 15 | fi 16 | 17 | work_dir=$PWD 18 | 19 | # Create the python venv if it doesn't already exist 20 | if [ -d "${work_dir}/$ENV_NAME" ];then 21 | echo "env $ENV_NAME already exists" 22 | else 23 | python3 -m venv $ENV_NAME 24 | echo "$ENV_NAME environment is created in ${work_dir}" 25 | fi 26 | 27 | # Activate the venv and then install itwinai as editable 28 | source $ENV_NAME/bin/activate 29 | pip install uv 30 | 31 | if [ -z "$NO_CUDA" ]; then 32 | # Install with CUDA support 33 | uv pip install -e ".[torch,dev]" \ 34 | --no-cache-dir \ 35 | --extra-index-url https://download.pytorch.org/whl/cu126 36 | else 37 | # Install without CUDA support (avoid uv here) 38 | pip install -e ".[torch,dev]" \ 39 | --no-cache-dir \ 40 | --extra-index-url https://download.pytorch.org/whl/cpu 41 | fi 42 | 43 | 44 | # Install Prov4ML 45 | if [[ "$(uname)" == "Darwin" ]]; then 46 | uv pip install --no-cache-dir "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2" 47 | else 48 
| # Assuming Nvidia GPUs are available 49 | uv pip install --no-cache-dir "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2" 50 | fi 51 | -------------------------------------------------------------------------------- /src/itwinai/constants.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Linus Eickhoff 5 | # 6 | # Credit: 7 | # - Linus Eickhoff - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Constants used in the itwinai project.""" 11 | from pathlib import Path 12 | 13 | # Directory names for logging and profiling data 14 | PROFILER_TRACES_DIR_NAME = "profiler-traces" 15 | 16 | # mlflow 17 | RELATIVE_MLFLOW_PATH = Path("mllogs/mlflow") 18 | BASE_EXP_NAME: str = "unnamed-experiment" 19 | PROFILING_AVG_NAME: str = "torch_profiling_averages" 20 | 21 | adjectives = [ 22 | "quantum", 23 | "relativistic", 24 | "wavy", 25 | "entangled", 26 | "chiral", 27 | "tachyonic", 28 | "superluminal", 29 | "anomalous", 30 | "hypercharged", 31 | "fermionic", 32 | "hadronic", 33 | "quarky", 34 | "holographic", 35 | "dark", 36 | "force-sensitive", 37 | "chaotic", 38 | ] 39 | 40 | names = [ 41 | "neutrino", 42 | "graviton", 43 | "muon", 44 | "gluon", 45 | "tachyon", 46 | "quasar", 47 | "pulsar", 48 | "blazar", 49 | "meson", 50 | "boson", 51 | "hyperon", 52 | "starlord", 53 | "groot", 54 | "rocket", 55 | "yoda", 56 | "skywalker", 57 | "sithlord", 58 | "midichlorian", 59 | "womp-rat", 60 | "beskar", 61 | "mandalorian", 62 | "ewok", 63 | "vibranium", 64 | "nova", 65 | "gamora", 66 | "drax", 67 | "ronan", 68 | "thanos", 69 | "cosmo", 70 | ] 71 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/create_inference_sample.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """Create a simple inference dataset sample and a checkpoint.""" 11 | 12 | import argparse 13 | import os 14 | 15 | import torch 16 | from dataloader import InferenceMNIST 17 | from model import Net 18 | 19 | 20 | def mnist_torch_inference_files( 21 | root: str = ".", 22 | samples_path: str = "mnist-sample-data/", 23 | model_name: str = "mnist-pre-trained.pth", 24 | ): 25 | """Create a sample dataset and a fake model checkpoint to test the MNIST 26 | inference workflow. Assumes it is run from the use case folder. 27 | 28 | Args: 29 | root (str, optional): where to create the files. Defaults to '.'. 30 | samples_path (str, optional): where to save the sample images. Defaults to 'mnist-sample-data/'. 31 | model_name (str, optional): name of the fake checkpoint file. Defaults to 'mnist-pre-trained.pth'. 
32 | """ 33 | 34 | sample = os.path.join(root, samples_path) 35 | InferenceMNIST.generate_jpg_sample(sample, 10) 36 | 37 | # Fake checkpoint 38 | dummy_nn = Net() 39 | mdl_ckpt = os.path.join(root, model_name) 40 | torch.save(dummy_nn, mdl_ckpt) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("--root", type=str, default=".") 46 | parser.add_argument("--samples-path", type=str, default="mnist-sample-data") 47 | parser.add_argument("--model-name", type=str, default="mnist-pre-trained.pth") 48 | args = parser.parse_args() 49 | mnist_torch_inference_files(**vars(args)) 50 | -------------------------------------------------------------------------------- /docs/installation/user_installation.rst: -------------------------------------------------------------------------------- 1 | User Installation (for Non-Developers) 2 | ====================================== 3 | 4 | This guide provides step-by-step instructions for installing the ``itwinai`` library for 5 | users. 6 | 7 | .. The explanation for creating a venv is the same for developers and users 8 | .. include:: ./software_prerequisites.rst 9 | 10 | 11 | Installing the ``itwinai`` Library 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | You can choose if you want to install ``itwinai`` with support for either PyTorch or 14 | TensorFlow by using extras: 15 | 16 | .. tab-set:: 17 | 18 | .. tab-item:: PyTorch 19 | 20 | To install ``itwinai`` with PyTorch without GPU acceleration, you can use the 21 | following command: 22 | 23 | .. code-block:: bash 24 | 25 | uv pip install "itwinai[torch]" 26 | 27 | To enable GPU acceleration, you can use the following command: 28 | 29 | .. code-block:: bash 30 | 31 | uv pip install "itwinai[torch]" \ 32 | --extra-index-url https://download.pytorch.org/whl/cu121 33 | 34 | 35 | .. tab-item:: TensorFlow 36 | 37 | To install ``itwinai`` with TensorFlow without GPU acceleration, you can use the 38 | following command: 39 | 40 | .. code-block:: bash 41 | 42 | uv pip install "itwinai[tf]" 43 | 44 | To enable GPU acceleration, you can use the following command: 45 | 46 | .. code-block:: bash 47 | 48 | uv pip install "itwinai[tf-cuda]" 49 | 50 | .. Explanation for installing horovod, DS, and other packages that need to be installed AFTER itwinai 51 | .. include:: ./post_itwinai_installation.rst 52 | -------------------------------------------------------------------------------- /.github/linters/.ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 95 2 | 3 | [lint] 4 | select = [ 5 | "E", # pycodestyle errors 6 | "F", # pyflakes: undefined names, unused imports, etc. 7 | "I", # isort: import sorting 8 | "W", # pycodestyle warnings 9 | "B", # flake8-bugbear: likely bugs and bad practices (e.g. mutable defaults) 10 | "C4", # flake8-comprehensions: unnecessary or suboptimal comprehensions 11 | "SIM", # flake8-simplify: redundant ifs, returns, boolean logic 12 | "UP", # pyupgrade: use modern Python syntax (e.g. f-strings, `Path()` literals) 13 | "PTH", # flake8-use-pathlib: use pathlib instead of os.path 14 | "N", # pep8-naming: naming conventions for classes, functions, variables 15 | ] 16 | ignore = [ 17 | "E203", # Whitespace before ':' – conflicts with Black 18 | "PTH109", # Allow os.getcwd() 19 | "PTH122", # Avoid replacing os.path.splitext – Path.suffix drops info (e.g. 
.tar.gz) 20 | "PTH123", # Allow use of builtin open() – Path.open() adds no real benefit 21 | "UP006", # Keep using typing.List/Dict/Set – prefer consistency over builtin generics 22 | "UP035", # Same as above – avoid auto-converting to list[]/dict[] syntax 23 | "B904", # Don't require `from err` in CLI code – breaks Typer/Click behavior 24 | "SIM108", # Don't always use ternary operators — they can be kind of hard to read sometimes 25 | "N806", # Allow UPPER_CASE_VARIABLE_NAMES in function scopes (for default values etc.) 26 | "N812", # Allow importing stuff as uppercase (e.g. function as F) 27 | ] 28 | fixable = ["ALL"] 29 | 30 | [format] 31 | quote-style = "double" 32 | indent-style = "space" 33 | skip-magic-trailing-comma = false 34 | line-ending = "auto" 35 | -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/pipeline.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | # General config 12 | verbose: auto 13 | micro_batch_size: 17 14 | epochs: 3 15 | checkpoints_path: checkpoints 16 | tb_log_dir: ./logs 17 | 18 | # Training pipeline 19 | pipeline: 20 | _target_: itwinai.pipeline.Pipeline 21 | steps: 22 | - _target_: dataloader.MNISTDataGetter 23 | - _target_: dataloader.MNISTDataPreproc 24 | classes: 10 25 | - _target_: itwinai.tensorflow.trainer.TensorflowTrainer 26 | epochs: ${epochs} 27 | micro_batch_size: ${micro_batch_size} 28 | verbose: ${verbose} 29 | model_compile_config: 30 | loss: 31 | _target_: tensorflow.keras.losses.CategoricalCrossentropy 32 | from_logits: False 33 | optimizer: 34 | _target_: tensorflow.keras.optimizers.Adam 35 | learning_rate: 0.001 36 | model_config: 37 | _target_: itwinai.tensorflow.models.mnist.MNIST_Model 38 | input_shape: [28, 28, 1] 39 | output_shape: 10 40 | callbacks: 41 | - _target_: keras.callbacks.EarlyStopping 42 | patience: 2 43 | - _target_: keras.callbacks.ModelCheckpoint 44 | filepath: ${checkpoints_path}/model.{epoch:02d}-{val_loss:.2f}.keras 45 | - _target_: keras.callbacks.TensorBoard 46 | log_dir: ${tb_log_dir} 47 | 48 | -------------------------------------------------------------------------------- /use-cases/lattice-qcd/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2023 Javad Komijani 2 | 3 | 4 | from setuptools import setup 5 | 6 | 7 | def readme(): 8 | with open('README.rst') as f: 9 | return f.read() 10 | 11 | 12 | packages = [ 13 | 'normflow', 14 | 'normflow.action', 15 | 'normflow.lib', 16 | 'normflow.lib.combo', 17 | 'normflow.lib.indexing', 18 | 'normflow.lib.linalg', 19 | 'normflow.lib.spline', 20 | 'normflow.lib.stats', 21 | 'normflow.mask', 22 | 'normflow.mcmc', 23 | 'normflow.nn', 24 | 'normflow.nn.scalar', 25 | 'normflow.prior' 26 | ] 27 | 28 | package_dir = { 29 | 'normflow': 'src', 30 | 'normflow.action': 'src/action', 31 | 'normflow.lib': 'src/lib', 32 | 'normflow.lib.combo': 'src/lib/combo', 33 | 'normflow.lib.indexing': 'src/lib/indexing', 34 | 'normflow.lib.linalg': 'src/lib/linalg', 35 | 'normflow.lib.spline': 'src/lib/spline', 36 | 'normflow.lib.stats': 'src/lib/stats', 37 | 
'normflow.mask': 'src/mask', 38 | 'normflow.mcmc': 'src/mcmc', 39 | 'normflow.nn': 'src/nn', 40 | 'normflow.nn.scalar': 'src/nn/scalar', 41 | 'normflow.prior': 'src/prior' 42 | } 43 | 44 | setup(name='normflow', 45 | version='1.1', 46 | description='Normalizing flow for generating lattice field configurations', 47 | packages=packages, 48 | package_dir=package_dir, 49 | url='http://github.com/jkomijani/normflow', 50 | author='Javad Komijani', 51 | author_email='jkomijani@gmail.com', 52 | license='MIT', 53 | install_requires=['numpy>=1.20', 'torch>=2.0'], 54 | zip_safe=False 55 | ) 56 | -------------------------------------------------------------------------------- /docs/how-it-works/training/explain_ddp.rst: -------------------------------------------------------------------------------- 1 | Explanation of Distributed Data Parallelism 2 | ------------------------------------------- 3 | 4 | **Author(s)**: Killian Verder (CERN), Matteo Bunino (CERN) 5 | 6 | Deep neural networks (DNN) are often extremely large and are trained on massive amounts 7 | of data, more than most computers have memory for. Even smaller DNNs can take days to 8 | train. Distributed Data Parallel (DDP) addresses these two issues, long training times 9 | and limited memory, by using multiple machines to host and train both model and data. 10 | 11 | Data parallelism is an easy way for a developer to vastly reduce training times. Rather 12 | than using single-node parallelism, DDP scales to multiple machines. This scaling 13 | maximises parallelisation of your model and drastically reduces training times. 14 | 15 | Another benefit of DDP is that it relaxes single-machine memory constraints. Since the 16 | dataset is sharded across several machines, and optimizer state or even model parameters 17 | can be sharded too with techniques such as DeepSpeed ZeRO (see below), it becomes possible to work with much larger datasets and models.
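To make the mechanics concrete, here is a minimal sketch of a single DDP training step in plain PyTorch (not itwinai-specific code), assuming it is launched with ``torchrun`` on a machine with NVIDIA GPUs:

.. code-block:: python

    import os

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP


    def main():
        # torchrun sets RANK, LOCAL_RANK and WORLD_SIZE for every worker process
        dist.init_process_group(backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)

        # Every worker holds a full replica of the model on its own GPU
        model = torch.nn.Linear(10, 1).to(local_rank)
        ddp_model = DDP(model, device_ids=[local_rank])
        optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

        # In a real job each worker receives a different shard of the dataset,
        # e.g. via torch.utils.data.distributed.DistributedSampler; random
        # tensors are used here to keep the sketch self-contained
        inputs = torch.randn(32, 10, device=f"cuda:{local_rank}")
        targets = torch.randn(32, 1, device=f"cuda:{local_rank}")

        loss = torch.nn.functional.mse_loss(ddp_model(inputs), targets)
        # backward() triggers the all-reduce that averages gradients across workers
        loss.backward()
        optimizer.step()

        dist.destroy_process_group()


    if __name__ == "__main__":
        main()

Such a script could be launched with, e.g., ``torchrun --nproc_per_node=4 ddp_sketch.py``; the itwinai trainers take care of this boilerplate (and of the Horovod and DeepSpeed equivalents) for you.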
18 | 19 | Below is a list of resources expanding on theoretical aspects and practical 20 | implementations of DDP: 21 | 22 | * Introduction to DP: https://siboehm.com/articles/22/data-parallel-training 23 | 24 | * https://pytorch.org/tutorials/beginner/ddp_series_theory.html 25 | 26 | * https://pytorch.org/tutorials/intermediate/ddp_tutorial.html 27 | 28 | * https://huggingface.co/blog/pytorch-ddp-accelerate-transformers 29 | 30 | 31 | Data-Parallelism with Deepspeed's Zero Redundancy Optimizer (ZeRO): 32 | 33 | * https://sumanthrh.com/post/distributed-and-efficient-finetuning/#zero-powered-data-parallelism 34 | 35 | 36 | Investigation of expected performance improvement: 37 | 38 | * https://www.mdpi.com/2079-9292/11/10/1525 39 | 40 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/runall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------------------------------------------------------------------------------- 4 | # Part of the interTwin Project: https://www.intertwin.eu/ 5 | # 6 | # Created by: Matteo Bunino 7 | # 8 | # Credit: 9 | # - Matteo Bunino - CERN 10 | # -------------------------------------------------------------------------------------- 11 | 12 | # Clear SLURM logs (*.out and *.err files) 13 | rm -rf logs_slurm 14 | mkdir logs_slurm 15 | rm -rf logs_torchrun 16 | 17 | # DDP itwinai 18 | DIST_MODE="ddp" 19 | RUN_NAME="ddp-itwinai" 20 | TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=ddp' 21 | sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 22 | --job-name="$RUN_NAME-n$N" \ 23 | --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 24 | --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 25 | slurm.sh 26 | 27 | # DeepSpeed itwinai 28 | DIST_MODE="deepspeed" 29 | RUN_NAME="deepspeed-itwinai" 30 | TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=deepspeed' 31 | sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 32 | --job-name="$RUN_NAME-n$N" \ 33 | --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 34 | --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 35 | slurm.sh 36 | 37 | # # Horovod itwinai 38 | # DIST_MODE="horovod" 39 | # RUN_NAME="horovod-itwinai" 40 | # TRAINING_CMD='/usr/local/bin/itwinai exec-pipeline strategy=horovod' 41 | # sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ 42 | # --job-name="$RUN_NAME-n$N" \ 43 | # --output="logs_slurm/job-$RUN_NAME-n$N.out" \ 44 | # --error="logs_slurm/job-$RUN_NAME-n$N.err" \ 45 | # slurm.sh -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-1-mnist/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: distributed strategies for a PyTorch model trained on the MNIST dataset 2 | 3 | **Author(s)**: Matteo Bunino (CERN), Jarl Sondre Sæther (CERN) 4 | 5 | In this tutorial we show how to use torch `DistributedDataParallel` (DDP), Horovod and 6 | DeepSpeed from the same client code. 7 | Note that the environment is tested on the HDFML system at JSC. For other systems, 8 | the module versions might need to change accordingly. 9 | 10 | ## Setup 11 | 12 | First, from the root of this repository, build the environment containing 13 | pytorch, horovod and deepspeed. 
You can *try* with:
14 |
15 | ```bash
16 | # Creates a Python venv called envAI_hdfml
17 | make torch-gpu-jsc
18 | ```
19 |
20 | Before launching training, since there is no internet connection on JSC's compute nodes,
21 | you need to download the dataset beforehand while on the login node:
22 |
23 | ```bash
24 | source ../../../envAI_hdfml/bin/activate
25 | python train.py --download-only
26 | ```
27 |
28 | This command creates a local folder called "MNIST" with the dataset.
29 |
30 | ## Distributed training
31 |
32 | You can run your training with SLURM by using the `itwinai` SLURM Builder. Use the
33 | `slurm_config.yaml` file to specify your SLURM parameters and then preview your script
34 | with the following command:
35 |
36 | ```bash
37 | itwinai generate-slurm -c slurm_config.yaml --no-save-script --no-submit-job
38 | ```
39 |
40 | If you are happy with the script, you can then run it by omitting `--no-submit-job`:
41 |
42 | ```bash
43 | itwinai generate-slurm -c slurm_config.yaml --no-save-script
44 | ```
45 |
46 | If you want to store a copy of the script in a folder, then you can similarly omit
47 | `--no-save-script`:
48 |
49 | ```bash
50 | itwinai generate-slurm -c slurm_config.yaml
51 | ```
52 |
-------------------------------------------------------------------------------- /src/itwinai/torch/reproducibility.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """This module provides the tools to support reproducible execution of torch scripts."""
11 |
12 | import random
13 | from typing import Optional
14 |
15 | import numpy as np
16 | import torch
17 |
18 |
19 | def seed_worker(worker_id):
20 |     """Seed DataLoader worker."""
21 |     worker_seed = torch.initial_seed() % 2**32
22 |     np.random.seed(worker_seed)
23 |     random.seed(worker_seed)
24 |
25 |
26 | def set_seed(rnd_seed: Optional[int], deterministic_cudnn: bool = True) -> torch.Generator:
27 |     """Set torch random seed and return a PRNG object.
28 |
29 |     Args:
30 |         rnd_seed (Optional[int]): random seed. If None, the seed is not set.
31 |         deterministic_cudnn (bool): if True, sets
32 |             ``torch.backends.cudnn.benchmark = False``, which may affect
33 |             performance.
34 |
35 |     Returns:
36 |         torch.Generator: PRNG object.
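    Example (a minimal usage sketch; ``dataset`` is assumed to be any
    ``torch.utils.data.Dataset`` instance you already have):

        >>> from torch.utils.data import DataLoader
        >>> g = set_seed(42)
        >>> loader = DataLoader(dataset, worker_init_fn=seed_worker, generator=g)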
37 | """ 38 | g = torch.Generator() 39 | if rnd_seed is not None: 40 | # Deterministic execution 41 | np.random.seed(rnd_seed) 42 | random.seed(rnd_seed) 43 | torch.manual_seed(rnd_seed) 44 | g.manual_seed(rnd_seed) 45 | if torch.cuda.is_available(): 46 | torch.cuda.manual_seed(rnd_seed) 47 | torch.cuda.manual_seed_all(rnd_seed) 48 | if deterministic_cudnn: 49 | torch.backends.cudnn.benchmark = False 50 | torch.backends.cudnn.deterministic = True 51 | return g 52 | -------------------------------------------------------------------------------- /use-cases/mnist/tensorflow/dataloader.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Roman Machacek 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | 12 | from typing import Tuple 13 | 14 | import tensorflow as tf 15 | import tensorflow.keras as keras 16 | 17 | from itwinai.components import DataGetter, DataProcessor, monitor_exec 18 | 19 | 20 | class MNISTDataGetter(DataGetter): 21 | def __init__(self): 22 | super().__init__() 23 | self.save_parameters(**self.locals2params(locals())) 24 | 25 | @monitor_exec 26 | def execute(self) -> Tuple: 27 | train, test = keras.datasets.mnist.load_data() 28 | return train, test 29 | 30 | 31 | class MNISTDataPreproc(DataProcessor): 32 | def __init__(self, classes: int): 33 | super().__init__() 34 | self.save_parameters(**self.locals2params(locals())) 35 | self.classes = classes 36 | 37 | @monitor_exec 38 | def execute( 39 | self, 40 | *datasets, 41 | ) -> Tuple: 42 | options = tf.data.Options() 43 | options.experimental_distribute.auto_shard_policy = ( 44 | tf.data.experimental.AutoShardPolicy.DATA 45 | ) 46 | preprocessed = [] 47 | for dataset in datasets: 48 | x, y = dataset 49 | y = keras.utils.to_categorical(y, self.classes) 50 | sliced = tf.data.Dataset.from_tensor_slices((x, y)) 51 | sliced = sliced.with_options(options) 52 | preprocessed.append(sliced) 53 | return tuple(preprocessed) 54 | -------------------------------------------------------------------------------- /env-files/torch/createEnvJSC.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | if [ ! -f "env-files/torch/generic_torch.sh" ]; then 5 | echo "ERROR: env-files/torch/generic_torch.sh not found!" 6 | exit 1 7 | fi 8 | 9 | # set dir 10 | cDir=$PWD 11 | 12 | # get sys info 13 | sysN="$(uname -n | cut -f2- -d.)" 14 | sysN="${sysN%%[0-9]*}" 15 | 16 | # load modules 17 | # NOTE: REFLECT THEM IN THE MAIN README! 
18 | ml --force purge 19 | ml Stages/2025 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA 20 | ml Python CMake HDF5 PnetCDF libaio mpi4py git 21 | 22 | # Create and install torch env 23 | export ENV_NAME="envAI_$sysN" 24 | bash env-files/torch/generic_torch.sh 25 | source $ENV_NAME/bin/activate 26 | 27 | # fix IB IP config - FZJ specific 28 | if [ -f "${cDir}/envAI_${sysN}/bin/torchrun" ]; then 29 | sed -i -e '5,100s/^/#/' ${cDir}/envAI_${sysN}/bin/torchrun 30 | echo """ 31 | import re 32 | import sys 33 | from torch.distributed.run import main 34 | from torch.distributed.elastic.agent.server import api as sapi 35 | 36 | def new_get_fq_hostname(): 37 | return _orig_get_fq_hostname().replace('.', 'i.', 1) 38 | 39 | if __name__ == '__main__': 40 | _orig_get_fq_hostname = sapi._get_fq_hostname 41 | sapi._get_fq_hostname = new_get_fq_hostname 42 | sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) 43 | sys.exit(main()) 44 | """ >> ${cDir}/envAI_${sysN}/bin/torchrun 45 | fi 46 | 47 | # JUBE benchmarking environment 48 | if [ -f "${cDir}/envAI_${sysN}/bin/jube" ]; then 49 | echo 'JUBE already installed' 50 | else 51 | pip3 install --no-cache-dir http://apps.fz-juelich.de/jsc/jube/jube2/download.php?version=latest 52 | fi 53 | 54 | # some tests 55 | echo "unit tests:" 56 | for item in 'torch' 'deepspeed' 'horovod';do 57 | python3 -c "import $item; print('$item version:',$item.__version__)" 58 | done 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.defaultFormatter": null, 4 | "editor.rulers": [ 5 | 95 6 | ], 7 | "cSpell.ignoreWords": [ 8 | "itwinpreproc", 9 | "typer" 10 | ], 11 | "cSpell.words": [ 12 | "argmax", 13 | "autolog", 14 | "Convolutional", 15 | "cuda", 16 | "dataloaders", 17 | "dataloading", 18 | "fromlist", 19 | "hyperparameters", 20 | "hyperparams", 21 | "imagenet", 22 | "ipython", 23 | "itwinai", 24 | "Lockfiles", 25 | "logfiles", 26 | "logits", 27 | "Mambaforge", 28 | "Micromamba", 29 | "mlflow", 30 | "mnist", 31 | "multiclass", 32 | "mypackage", 33 | "NCCL", 34 | "omegaconf", 35 | "optim", 36 | "plmodels", 37 | "preds", 38 | "preproc", 39 | "pytest", 40 | "pyyaml", 41 | "relu", 42 | "Roadmap", 43 | "savedir", 44 | "SLURM", 45 | "softmax", 46 | "tensorboard", 47 | "torchmetrics", 48 | "torchvision", 49 | "venv", 50 | "wandb" 51 | ], 52 | "markdownlint.run": "onType", 53 | "markdownlint.config": { 54 | "MD013": { 55 | "line_length": 120 56 | } 57 | }, 58 | "[python]": { 59 | "editor.defaultFormatter": "charliermarsh.ruff" 60 | }, 61 | "python.testing.pytestArgs": [ 62 | "tests" 63 | ], 64 | "python.testing.unittestEnabled": false, 65 | "python.testing.pytestEnabled": true, 66 | "python.analysis.extraPaths": [ 67 | "./src/itwinai" 68 | ], 69 | "makefile.configureOnOpen": false, 70 | "files.associations": { 71 | "*.log.*": "log", 72 | "*.err": "log", 73 | "*.out": "log" 74 | } 75 | } -------------------------------------------------------------------------------- /docs/api/itwinai.torch.modules.rst: -------------------------------------------------------------------------------- 1 | itwinai.torch 2 | ============= 3 | 4 | 5 | config.py 6 | ++++++++++++++++++ 7 | .. automodule:: itwinai.torch.config 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | :member-order: bysource 12 | 13 | 14 | distributed.py 15 | ++++++++++++++ 16 | .. 
automodule:: itwinai.torch.distributed
17 |     :members:
18 |     :undoc-members:
19 |     :show-inheritance:
20 |     :member-order: bysource
21 |
22 |
23 | gan.py
24 | ++++++++++++++
25 | .. automodule:: itwinai.torch.gan
26 |     :members:
27 |     :undoc-members:
28 |     :show-inheritance:
29 |     :member-order: bysource
30 |
31 |
32 | inference.py
33 | ++++++++++++
34 | .. automodule:: itwinai.torch.inference
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 |     :member-order: bysource
39 |
40 |
41 | loggers.py
42 | ++++++++++
43 | .. automodule:: itwinai.torch.loggers
44 |     :members:
45 |     :undoc-members:
46 |     :show-inheritance:
47 |     :member-order: bysource
48 |
49 |
50 | mlflow.py
51 | +++++++++
52 | .. automodule:: itwinai.torch.mlflow
53 |     :members:
54 |     :undoc-members:
55 |     :show-inheritance:
56 |     :member-order: bysource
57 |
58 |
59 | reproducibility.py
60 | ++++++++++++++++++
61 | .. automodule:: itwinai.torch.reproducibility
62 |     :members:
63 |     :undoc-members:
64 |     :show-inheritance:
65 |     :member-order: bysource
66 |
67 |
68 | type.py
69 | ++++++++
70 | .. automodule:: itwinai.torch.type
71 |     :members:
72 |     :undoc-members:
73 |     :show-inheritance:
74 |     :member-order: bysource
75 |
76 |
77 | trainer.py
78 | ++++++++++
79 | .. automodule:: itwinai.torch.trainer
80 |     :members:
81 |     :undoc-members:
82 |     :show-inheritance:
83 |     :member-order: bysource
84 |
85 |
86 | tuning.py
87 | ++++++++++
88 | .. automodule:: itwinai.torch.tuning
89 |     :members:
90 |     :undoc-members:
91 |     :show-inheritance:
92 |     :member-order: bysource
-------------------------------------------------------------------------------- /tests/use-cases/test_cyclones.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # - Jarl Sondre Sæther - CERN
9 | # --------------------------------------------------------------------------------------
10 | """Tests for Cyclones use case.
11 |
12 | Intended to be integration tests, to make sure that updates in the code base
13 | do not break use cases' workflows.
14 | """
15 |
16 | import os
17 | import subprocess
18 | from pathlib import Path
19 |
20 | import pytest
21 |
22 | CYCLONES_PATH = Path("use-cases", "cyclones")
23 |
24 |
25 | @pytest.mark.skip("deprecated")
26 | def test_structure_cyclones(check_folder_structure):
27 |     """Test cyclones folder structure."""
28 |     check_folder_structure(CYCLONES_PATH)
29 |
30 |
31 | @pytest.mark.skip("deprecated")
32 | @pytest.mark.functional
33 | @pytest.mark.memory_heavy
34 | def test_cyclones_train_tf(tf_env, install_requirements, tmp_path):
35 |     """
36 |     Test the Cyclones TensorFlow trainer by running it end-to-end.
37 |
38 |     If the CMCCC_DATASET env variable is defined, it is used to
39 |     override the default dataset download location: useful
40 |     when it points to a local copy of the dataset, avoiding
41 |     a new download.
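    Example (the dataset path below is a hypothetical placeholder):

        CMCCC_DATASET=/path/to/local/cmcc pytest tests/use-cases/test_cyclones.py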
42 | """ 43 | # TODO: create a small sample dataset for tests only 44 | install_requirements(CYCLONES_PATH, tf_env) 45 | 46 | dataset_path = os.environ.get("CMCCC_DATASET", "./data/tmp_data") 47 | pipe = CYCLONES_PATH / "pipeline.yaml" 48 | train = CYCLONES_PATH / "train.py" 49 | 50 | cmd = ( 51 | f"{tf_env}/bin/python {train.resolve()} -p {pipe.resolve()} --data_path {dataset_path}" 52 | ) 53 | subprocess.run(cmd.split(), check=True, cwd=tmp_path) 54 | -------------------------------------------------------------------------------- /use-cases/cyclones/pipeline.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Roman Machacek - CERN 8 | # - Matteo Bunino - CERN 9 | # -------------------------------------------------------------------------------------- 10 | 11 | # General configuration 12 | epochs: 3 13 | micro_batch_size: 32 14 | dataset_url: https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD #https://drive.google.com/drive/folders/15DEq33MmtRvIpe2bNCg44lnfvEiHcPaf 15 | dataset_root: tmp_cyclones_data 16 | verbose: auto 17 | global_config: null 18 | 19 | # Workflows 20 | training_pipeline: 21 | class_path: itwinai.pipeline.Pipeline 22 | init_args: 23 | steps: 24 | download-step: 25 | class_path: dataloader.CyclonesDataGetter 26 | init_args: 27 | dataset_url: ${dataset_url} 28 | dataset_root: ${dataset_root} 29 | global_config: ${global_config} 30 | patch_type: NEAREST 31 | shuffle: False 32 | split_ratio: [0.75, 0.25] 33 | augment: True 34 | epochs: ${epochs} 35 | target_scale: False 36 | label_no_cyclone: NONE 37 | aug_type: ONLY_TCS 38 | experiment: { 39 | 'DRV_VARS_1': ['fg10', 'msl', 't_500', 't_300'], 40 | 'COO_VARS_1': ['patch_cyclone'], 41 | 'MSK_VAR_1': None 42 | } 43 | 44 | training-step: 45 | class_path: trainer.CyclonesTrainer 46 | init_args: 47 | epochs: ${epochs} 48 | micro_batch_size: ${micro_batch_size} 49 | global_config: ${global_config} 50 | network: VGG_V1 51 | activation: LINEAR 52 | regularization_strength: NONE 53 | learning_rate: 0.0001 54 | loss: MAE 55 | verbose: ${verbose} -------------------------------------------------------------------------------- /use-cases/lattice-qcd/config.yaml: -------------------------------------------------------------------------------- 1 | # General configuration 2 | batch_size: 128 3 | epochs: 100 4 | optim_lr: 0.001 5 | weight_decay: 0.01 6 | knots_len: 10 7 | symmetric: True 8 | shape: [1] 9 | kappa: 0 10 | m_sq: -1.2 11 | lambd: 0.5 12 | ckpt_disp: False 13 | save_every: None 14 | optimizer_class: torch.optim.AdamW 15 | loss_fn: None 16 | scheduler: None 17 | print_stride: 10 18 | print_batch_size: 1024 19 | snapshot_path: null 20 | epochs_run: 0 21 | strategy: 'ddp' 22 | 23 | training_pipeline: 24 | _target_: itwinai.pipeline.Pipeline 25 | steps: 26 | - _target_: normflow.Fitter 27 | model: 28 | _target_: normflow.Model 29 | net_: 30 | _target_: normflow.nn.DistConvertor_ 31 | knots_len: ${knots_len} 32 | symmetric: ${symmetric} 33 | prior: 34 | _target_: normflow.prior.NormalPrior 35 | shape: ${shape} 36 | action: 37 | _target_: normflow.action.ScalarPhi4Action 38 | kappa: ${kappa} 39 | m_sq: ${m_sq} 40 | lambd: ${lambd} 41 | config: 42 | optim_lr: ${optim_lr} 43 | weight_decay: ${weight_decay} 44 | save_every: ${save_every} 45 | ckpt_disp: 
${ckpt_disp}
46 |       batch_size: ${batch_size}
47 |       optimizer_class: ${optimizer_class}
48 |       scheduler: ${scheduler}
49 |       loss_fn: ${loss_fn}
50 |       print_stride: ${print_stride}
51 |       print_batch_size: ${print_batch_size}
52 |       snapshot_path: ${snapshot_path}
53 |       epochs_run: ${epochs_run}
54 |       epochs: ${epochs}
55 |       strategy: ${strategy}
56 |       measure_epoch_time: False
57 |       measure_gpu_data: False
58 |       enable_torch_profiling: False
59 |       logger:
60 |         _target_: itwinai.loggers.LoggersCollection
61 |         loggers:
62 |           - _target_: itwinai.loggers.ConsoleLogger
63 |             log_freq: 1
64 |           - _target_: itwinai.loggers.MLFlowLogger
65 |             experiment_name: Normalizing flows (ETHZ/CSIC)
66 |             log_freq: batch
67 |
-------------------------------------------------------------------------------- /use-cases/eurac/data.py: --------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple
2 | import xarray as xr
3 | from itwinai.components import DataSplitter, monitor_exec
4 |
5 | from hython.scaler import Scaler
6 | from hython.datasets import get_dataset
7 | from hython.datasets.wflow_sbm import WflowSBM
8 | from hython.config import Config
9 |
10 |
11 | class RNNDatasetGetterAndPreprocessor(DataSplitter):
12 |     def __init__(
13 |         self,
14 |         # == common ==
15 |         hython_trainer: str,
16 |         dataset: str,
17 |         data_lazy_load: bool,
18 |         scaling_variant: str,
19 |         scaling_use_cached: bool,
20 |         experiment_name: str,
21 |         experiment_run: str,
22 |         data_source: dict,
23 |         work_dir: str,
24 |         dynamic_inputs: List[str] | None = None,
25 |         static_inputs: List[str] | None = None,
26 |         target_variables: List[str] | None = None,
27 |         scaling_static_range: Dict | None = None,
28 |         mask_variables: List[str] | None = None,
29 |         static_inputs_mask: List[str] | None = None,
30 |         head_model_inputs: List[str] | None = None,
31 |         train_temporal_range: List[str] | None = None,
32 |         valid_temporal_range: List[str] | None = None,
33 |         train_downsampler: Dict | None = None,
34 |         valid_downsampler: Dict | None = None,
35 |         downsampling_temporal_dynamic: bool | None = None,
36 |         min_sample_target: int | None = None,
37 |         seq_length: int | None = None
38 |     ) -> None:
39 |         self.save_parameters(**self.locals2params(locals()))
40 |
41 |     @monitor_exec
42 |     def execute(self) -> Tuple[WflowSBM, WflowSBM, None]:
43 |         cfg = Config()
44 |
45 |         for i in self.parameters:
46 |             setattr(cfg, i, self.parameters[i])
47 |
48 |         scaler = Scaler(cfg, cfg.scaling_use_cached)
49 |
50 |         train_dataset = get_dataset(cfg.dataset)(cfg, scaler, True, "train")
51 |
52 |         val_dataset = get_dataset(cfg.dataset)(cfg, scaler, False, "valid")
53 |
54 |         return train_dataset, val_dataset, None
55 |
-------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | apiVersion: "kubeflow.org/v1"
11 | kind: PyTorchJob
12 | metadata:
13 |   name: torchrun-cpu
14 | spec:
15 |   # This property assumes that each pod runs on a separate node,
16 |   # and is propagated to torchrun as its --nproc-per-node argument
17 |   nprocPerNode: "2"
18 |   pytorchReplicaSpecs:
19 |     Master:
20
| # Usually only one Master pod is used 21 | replicas: 1 22 | restartPolicy: OnFailure 23 | template: 24 | spec: 25 | containers: 26 | - name: pytorch 27 | image: registry.cern.ch/itwinai/dist-ml/itwinai-slim:0.0.10 28 | command: 29 | - "torchrun" 30 | - "/app/train-cpu.py" 31 | - "--force-dist" 32 | resources: 33 | # Requests help to implicitly make sure that each pod is running 34 | # in a separate node. 35 | requests: 36 | cpu: 1500m 37 | limits: 38 | cpu: 1500m 39 | memory: 2500Mi 40 | Worker: 41 | # The number of worker pods 42 | replicas: 1 43 | restartPolicy: OnFailure 44 | template: 45 | spec: 46 | containers: 47 | - name: pytorch 48 | image: registry.cern.ch/itwinai/dist-ml/itwinai-slim:0.0.10 49 | command: 50 | - "torchrun" 51 | - "/app/train-cpu.py" 52 | - "--force-dist" 53 | resources: 54 | requests: 55 | cpu: 1500m 56 | limits: 57 | cpu: 1500m 58 | memory: 2500Mi 59 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-0-basics/tfmirrored_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=TFTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:15:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=32 17 | #SBATCH --gpus-per-node=4 18 | #SBATCH --exclusive 19 | 20 | # gres options have to be disabled for deepv 21 | #SBATCH --gres=gpu:4 22 | 23 | set -x 24 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 25 | 26 | # set modules 27 | ml --force purge 28 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 29 | 30 | # set env - change to location of your environment 31 | source itwinai/envAItf_hdfml/bin/activate 32 | 33 | # Using legacy (2.16) version of Keras 34 | # Latest version with TF (2.16) installs Keras 3.3 35 | # which returns an error for multi-node execution 36 | export TF_USE_LEGACY_KERAS=1 37 | 38 | # sleep a sec 39 | sleep 1 40 | 41 | # job info 42 | echo "DEBUG: TIME: $(date)" 43 | echo "DEBUG: EXECUTE: $EXEC" 44 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 45 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 46 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 47 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 48 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 49 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 50 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 51 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 52 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 53 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 54 | echo 55 | 56 | # set comm 57 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 58 | export OMP_NUM_THREADS=1 59 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 60 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 61 | fi 62 | 63 | COMMAND="train.py" 64 | 65 | EXEC="$COMMAND " 66 | 67 | srun python -u $EXEC 68 | -------------------------------------------------------------------------------- /use-cases/3dgan/slurm.vega.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SLURM jobscript for Vega systems 4 | 5 | # Job configuration 6 | #SBATCH --job-name=3dgan_training 7 | #SBATCH 
--account=s24r05-03-users 8 | #SBATCH --mail-user= 9 | #SBATCH --mail-type=ALL 10 | #SBATCH --output=job.out 11 | #SBATCH --error=job.err 12 | #SBATCH --time=01:00:00 13 | 14 | # Resources allocation 15 | #SBATCH --partition=gpu 16 | #SBATCH --nodes=2 17 | #SBATCH --gpus-per-node=4 18 | #SBATCH --cpus-per-gpu=4 19 | #SBATCH --ntasks-per-node=1 20 | # SBATCH --mem-per-gpu=10G 21 | #SBATCH --exclusive 22 | 23 | # gres options have to be disabled for deepv 24 | #SBATCH --gres=gpu:4 25 | 26 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 27 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 28 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 29 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 30 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 31 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 32 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 33 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 34 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 35 | 36 | # ml --force purge 37 | # ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/11.7 38 | # ml GCCcore/11.3.0 NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0 cuDNN 39 | 40 | ml Python 41 | module unload OpenSSL 42 | 43 | source ~/.bashrc 44 | 45 | # Activate the environment 46 | source ../../.venv-pytorch/bin/activate 47 | 48 | GAN_DATASET="exp_data" #"/ceph/hpc/data/st2301-itwin-users/egarciagarcia" 49 | 50 | # launch training 51 | TRAINING_CMD="$(which itwinai) exec-pipeline num_nodes=$SLURM_NNODES \ 52 | dataset_location=$GAN_DATASET " 53 | 54 | srun --cpu-bind=none --ntasks-per-node=1 \ 55 | bash -c "torchrun \ 56 | --log_dir='logs_torchrun' \ 57 | --nnodes=$SLURM_NNODES \ 58 | --nproc_per_node=$SLURM_GPUS_PER_NODE \ 59 | --rdzv_id=$SLURM_JOB_ID \ 60 | --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ 61 | --rdzv_backend=c10d \ 62 | --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)':29500 \ 63 | $TRAINING_CMD " -------------------------------------------------------------------------------- /tests/components/conftest.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | import pytest 11 | 12 | pytest.PIPE_LIST_YAML = """ 13 | my-list-pipeline: 14 | _target_: itwinai.pipeline.Pipeline 15 | steps: 16 | - _target_: itwinai.tests.dummy_components.FakePreproc 17 | max_items: 33 18 | name: my-preproc 19 | 20 | - _target_: itwinai.tests.dummy_components.FakeTrainer 21 | lr: 0.001 22 | batch_size: 32 23 | name: my-trainer 24 | """ 25 | 26 | pytest.PIPE_DICT_YAML = """ 27 | my-dict-pipeline: 28 | _target_: itwinai.pipeline.Pipeline 29 | steps: 30 | preproc-step: 31 | _target_: itwinai.tests.dummy_components.FakePreproc 32 | max_items: 33 33 | name: my-preproc 34 | 35 | train-step: 36 | _target_: itwinai.tests.dummy_components.FakeTrainer 37 | lr: 0.001 38 | batch_size: 32 39 | name: my-trainer 40 | """ 41 | 42 | pytest.NESTED_PIPELINE = """ 43 | some: 44 | field: 45 | my-nested-pipeline: 46 | _target_: itwinai.pipeline.Pipeline 47 | steps: 48 | - _target_: itwinai.tests.dummy_components.FakePreproc 49 | max_items: 33 50 | name: my-preproc 51 | 52 | - _target_: itwinai.tests.dummy_components.FakeTrainer 53 | lr: 0.001 
54 |       batch_size: 32
55 |       name: my-trainer
56 | """
57 |
58 | pytest.INTERPOLATED_VALUES_PIPELINE = """
59 | max_items: 33
60 | name: my-trainer
61 | lr: 0.001
62 | my-interpolation-pipeline:
63 |   _target_: itwinai.pipeline.Pipeline
64 |   steps:
65 |     - _target_: itwinai.tests.dummy_components.FakePreproc
66 |       max_items: ${max_items}
67 |       name: my-preproc
68 |
69 |     - _target_: itwinai.tests.dummy_components.FakeTrainer
70 |       lr: ${lr}
71 |       batch_size: 32
72 |       name: ${name}
73 | """
74 |
-------------------------------------------------------------------------------- /docs/installation/software_prerequisites.rst: --------------------------------------------------------------------------------
1 | Setting up the system dependencies
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | First of all, before installing itwinai and its Python dependencies, let's make sure that the
4 | system dependencies, such as CUDA drivers, compilers, and MPI libraries, are correctly set up.
5 |
6 | Supported OSs are Linux and macOS.
7 |
8 | .. warning::
9 |
10 |     On high-performance computing (HPC) systems, **you must load the appropriate modules
11 |     before creating or activating your Python virtual environment** to ensure compatibility with
12 |     system libraries.
13 |
14 | .. include:: ./hpc_modules.rst
15 |
16 |
17 | Creating a Python Virtual Environment
18 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 | The suggested way of managing Python dependencies, including itwinai, is through Python virtual
20 | environments. Creating a virtual environment allows you to isolate dependencies and prevent
21 | conflicts with other Python projects.
22 |
23 | Beware that some HPC centers advise against using Python virtual environments, as they create a
24 | large number of files, which can clog some distributed filesystems. In such situations, you
25 | should prefer using containers.
26 |
27 | To manage Python virtual environments we use UV, which can be installed from
28 | `this page `_. Learn more about the UV
29 | package manager in our `UV tutorial `_.
30 |
31 | If you don't already have a virtual environment, you can create one with the following
32 | command:
33 |
34 | .. code-block:: bash
35 |
36 |     # Remember to load the software modules first (see section above)!
37 |
38 |     uv venv
39 |
40 |     # Alternatively to the command above, if you just want to use plain pip instead of UV
41 |     python -m venv .venv
42 |
43 | Notice that a new directory called ``.venv`` is created to contain your virtual
44 | environment. Now, you can activate your virtual environment with the following command:
45 |
46 | .. code-block:: bash
47 |
48 |     # Remember to load the software modules first (see section above)!
49 |
50 |     source .venv/bin/activate
51 |
-------------------------------------------------------------------------------- /.github/workflows/pytest.yml: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Matteo Bunino
5 | #
6 | # Credit:
7 | # - Matteo Bunino - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | name: Testing with pytest
11 |
12 | on:
13 |   pull_request:
14 |     branches: [main]
15 |
16 | jobs:
17 |   test-torch:
18 |     name: Testing with pytest
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |
22 |       # Uncomment this only if you run out of disk space!
23 | # - name: Maximize Disk Space 24 | # uses: easimon/maximize-build-space@v10 25 | # with: 26 | # # Reserve space on root for docker/dagger cache 27 | # build-mount-path: /docker 28 | # root-reserve-mb: 2048 29 | # overprovision-lvm: false 30 | # swap-size-mb: 4096 31 | # remove-dotnet: true 32 | # remove-android: true 33 | # remove-haskell: true 34 | # remove-codeql: true 35 | 36 | - uses: actions/checkout@v6 37 | 38 | # ALSO uncomment this only if you run out of disk space! 39 | # - name: Move Docker directory 40 | # shell: bash 41 | # run: | 42 | # sudo mv /var/lib/docker /docker/ && 43 | # sudo ln -s /docker/docker /var/lib/docker && 44 | # sudo systemctl restart docker 45 | 46 | # Run tests with pytest in a container 47 | - name: Run Integration Test (development pipeline) 48 | uses: dagger/dagger-for-github@v7 49 | with: 50 | workdir: ci 51 | verb: call 52 | args: >- 53 | build-container 54 | --context .. 55 | --dockerfile ../env-files/torch/skinny.Dockerfile 56 | test-local 57 | --cmd "pytest,-v,--disable-warnings,-n,logical,/app/tests/,--dist,loadfile,-m,not hpc and not tensorflow" 58 | logs 59 | cloud-token: ${{ secrets.DAGGER_CLOUD_TOKEN }} 60 | version: latest 61 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | **/TODO 2 | **/mamba* 3 | pl-training.yml 4 | .vscode 5 | 6 | # Project folders/files 7 | # use-cases 8 | workflows 9 | CHANGELOG 10 | 11 | # Docs 12 | docs 13 | 14 | # interLink pods 15 | **/interLink 16 | **/interlink 17 | 18 | # Data 19 | **/MNIST 20 | **/*-predictions/ 21 | **/*-data/ 22 | **/*.tar.gz 23 | **/exp_data 24 | 25 | # Logs 26 | **/logs 27 | **/lightning_logs 28 | **/mlruns 29 | **/.logs 30 | **/mllogs 31 | **/nohup* 32 | **/*.out 33 | **/*.err 34 | **/checkpoints/ 35 | **/*_logs 36 | **/tmp* 37 | **/.tmp* 38 | 39 | # Markdown 40 | **/*.md 41 | 42 | # Custom envs 43 | **/.venv* 44 | 45 | # Git 46 | .git 47 | .gitignore 48 | .github 49 | 50 | # CI 51 | .codeclimate.yml 52 | .travis.yml 53 | .taskcluster.yml 54 | 55 | # Docker 56 | docker-compose.yml 57 | .docker 58 | .dockerignore 59 | Dockerfile 60 | 61 | # Byte-compiled / optimized / DLL files 62 | **/__pycache__/ 63 | **/*.py[cod] 64 | 65 | # C extensions 66 | *.so 67 | 68 | # Distribution / packaging 69 | .Python 70 | env/ 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | downloads/ 75 | **/eggs/ 76 | lib/ 77 | lib64/ 78 | parts/ 79 | sdist/ 80 | var/ 81 | **/*.egg-info/ 82 | **/.installed.cfg 83 | **/*.egg 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .coverage 99 | .cache 100 | nosetests.xml 101 | coverage.xml 102 | 103 | # Translations 104 | *.mo 105 | *.pot 106 | 107 | # Django stuff: 108 | *.log 109 | 110 | # Sphinx documentation 111 | docs/_build/ 112 | 113 | # PyBuilder 114 | target/ 115 | 116 | # Virtual environment 117 | .env/ 118 | .venv/ 119 | venv/ 120 | 121 | # PyCharm 122 | .idea 123 | 124 | # Python mode for VIM 125 | .ropeproject 126 | */.ropeproject 127 | */*/.ropeproject 128 | */*/*/.ropeproject 129 | 130 | # Vim swap files 131 | *.swp 132 | */*.swp 133 | */*/*.swp 134 | */*/*/*.swp -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-scaling-test-jube/jube_ddp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=JUBE_DDP 5 | #SBATCH --account=#ACC# 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=#TIMELIM# 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=#QUEUE# 14 | #SBATCH --nodes=#NODES# 15 | #SBATCH --cpus-per-task=#NW# 16 | #SBATCH --gpus-per-node=#NGPU# 17 | #SBATCH --exclusive 18 | 19 | # gres options have to be disabled for deepv 20 | #SBATCH --gres=gpu:4 21 | 22 | set -x 23 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 24 | 25 | # set modules 26 | ml --force purge 27 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 28 | 29 | # set env 30 | source /p/project/intertwin/rakesh/repo_push/itwinai/envAItf_hdfml/bin/activate 31 | 32 | # Using legacy (2.16) version of Keras 33 | # Latest version with TF (2.16) installs Keras 3.3 34 | # which returns an error for multi-node execution 35 | export TF_USE_LEGACY_KERAS=1 36 | 37 | # sleep a sec 38 | sleep 1 39 | 40 | # job info 41 | echo "DEBUG: TIME: $(date)" 42 | echo "DEBUG: EXECUTE: $EXEC" 43 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 44 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 45 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 46 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 47 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 48 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 49 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 50 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 51 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 52 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 53 | echo 54 | 55 | # set comm 56 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 57 | export OMP_NUM_THREADS=1 58 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 59 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 60 | fi 61 | 62 | dataDir='/p/scratch/intertwin/datasets/imagenet/' 63 | 64 | COMMAND="train.py" 65 | 66 | EXEC="$COMMAND \ 67 | --data_dir $dataDir" 68 | 69 | srun python -u $EXEC 70 | 71 | 72 | #eof 73 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/tf-tutorial-1-imagenet/tfmirrored_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=TFTest 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH 
--output=job.out
9 | #SBATCH --error=job.err
10 | #SBATCH --time=01:00:00
11 |
12 | # configure node and process count on the CM
13 | #SBATCH --partition=batch
14 | #SBATCH --nodes=4
15 | #SBATCH --ntasks-per-node=1
16 | #SBATCH --cpus-per-task=32
17 | #SBATCH --gpus-per-node=4
18 | #SBATCH --exclusive
19 |
20 | # gres options have to be disabled for deepv
21 | #SBATCH --gres=gpu:4
22 |
23 | set -x
24 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
25 |
26 | # set modules
27 | ml --force purge
28 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
29 |
30 | # set env
31 | source /p/project/intertwin/rakesh/repo_push/itwinai/envAItf_hdfml/bin/activate
32 |
33 | # Using legacy (2.16) version of Keras
34 | # Latest version with TF (2.16) installs Keras 3.3
35 | # which returns an error for multi-node execution
36 | export TF_USE_LEGACY_KERAS=1
37 |
38 | # sleep a sec
39 | sleep 1
40 |
41 | # job info
42 | echo "DEBUG: TIME: $(date)"
43 | echo "DEBUG: EXECUTE: $EXEC"
44 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR"
45 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
46 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
47 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
48 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
49 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
50 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
51 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
52 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
53 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST"
54 | echo
55 |
56 | # set comm
57 | export CUDA_VISIBLE_DEVICES="0,1,2,3"
58 | export OMP_NUM_THREADS=1
59 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then
60 |   export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
61 | fi
62 |
63 | dataDir='/p/scratch/intertwin/datasets/imagenet/'
64 |
65 | COMMAND="train.py"
66 |
67 | EXEC="$COMMAND \
68 |     --data_dir $dataDir"
69 |
70 | srun python -u $EXEC
71 |
-------------------------------------------------------------------------------- /docs/tutorials/tutorials.rst: --------------------------------------------------------------------------------
1 | .. _tutorials:
2 |
3 |
4 |
5 | .. _distributed-training-tutorials:
6 |
7 | Distributed machine learning training
8 | ======================================
9 |
10 | Here you can find a collection of tutorials for distributing PyTorch- and TensorFlow-based workflows.
11 |
12 |
13 | Distributed ML with PyTorch
14 | ---------------------------
15 |
16 | .. toctree::
17 |     :maxdepth: 1
18 |     :numbered:
19 |
20 |     distrib-ml/torch_tutorial_0_basics
21 |     distrib-ml/torch_tutorial_1_mnist
22 |     distrib-ml/torch_tutorial_2_trainer_class
23 |     distrib-ml/torch-tutorial-GAN
24 |     distrib-ml/torch_scaling_test
25 |     distrib-ml/torch-tutorial-containers
26 |     distrib-ml/torch_tutorial_kubeflow_1.rst
27 |     distrib-ml/kuberay-setup-tutorial.rst
28 |
29 |
30 | Distributed ML with TensorFlow
31 | ------------------------------
32 |
33 | .. toctree::
34 |     :maxdepth: 1
35 |     :numbered:
36 |
37 |     distrib-ml/tf_tutorial_0_basics
38 |     distrib-ml/tf_tutorial_1_imagenet
39 |     distrib-ml/tf_scaling_test
40 |
41 |
42 | .. _ml-workflows-tutorials:
43 |
44 | Machine Learning Workflows
45 | ===========================
46 |
47 | Here you can find a collection of tutorials for ML workflows of varying complexity.
48 |
49 | ..
toctree:: 50 | :maxdepth: 1 51 | 52 | workflows/01-pipeline-introduction/tutorial_0_basic_workflow 53 | workflows/02-pipeline-configuration/tutorial_1_intermediate_workflow 54 | workflows/03-dag-workflows/tutorial_2_advanced_workflow 55 | workflows/04_itwinai_argparser 56 | 57 | 58 | .. _hpo-tutorials: 59 | 60 | Hyperparameter Optimization 61 | =========================== 62 | 63 | This tutorial provides an overview of Hyperparameter Optimization (HPO) workflows. 64 | 65 | .. toctree:: 66 | :maxdepth: 1 67 | 68 | hpo-workflows/hpo-torchtrainer-integration 69 | 70 | 71 | .. _profiling-tutorials: 72 | 73 | Code Profiling and Optimization 74 | =============================== 75 | 76 | Here you can find our tutorials on how to do profiling with **itwinai**: 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | 81 | profiling/profiling-overview 82 | profiling/py-spy-profiling 83 | profiling/py-spy-lattice-qcd-example 84 | -------------------------------------------------------------------------------- /use-cases/cyclones/startscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # general configuration of the job 4 | #SBATCH --job-name=cyclones 5 | #SBATCH --account=intertwin 6 | #SBATCH --mail-user= 7 | #SBATCH --mail-type=ALL 8 | #SBATCH --output=job.out 9 | #SBATCH --error=job.err 10 | #SBATCH --time=00:30:00 11 | 12 | # configure node and process count on the CM 13 | #SBATCH --partition=batch 14 | #SBATCH --nodes=2 15 | #SBATCH --ntasks-per-node=1 16 | #SBATCH --cpus-per-task=4 17 | #SBATCH --gpus-per-node=4 18 | 19 | # SBATCH --exclusive 20 | 21 | # gres options have to be disabled for deepv 22 | #SBATCH --gres=gpu:4 23 | 24 | set -x 25 | unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY 26 | 27 | # load modules 28 | ml --force purge 29 | ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12 30 | 31 | source ../../envAItf_hdfml/bin/activate 32 | 33 | # job info 34 | echo "DEBUG: TIME: $(date)" 35 | echo "DEBUG: EXECUTE: $EXEC" 36 | echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" 37 | echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" 38 | echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 39 | echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" 40 | echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" 41 | echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" 42 | echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" 43 | echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" 44 | echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" 45 | echo "DEBUG: SLURM_NODELIST: $SLURM_NODELIST" 46 | echo 47 | 48 | # ONLY IF TENSORFLOW >= 2.16: 49 | # # Using legacy (2.16) version of Keras 50 | # # Latest version with TF (2.16) installs Keras 3.3 51 | # # which returns an error for multi-node execution 52 | # export TF_USE_LEGACY_KERAS=1 53 | 54 | # set comm 55 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 56 | export OMP_NUM_THREADS=1 57 | if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then 58 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK 59 | fi 60 | 61 | # ON LOGIN NODE download datasets: 62 | # ../../.venv-tf/bin/python train.py -p pipeline.yaml --download-only 63 | 64 | # --data_path argument is optional, but on JSC we use the dataset we previously downloaded 65 | srun python train.py -p pipeline.yaml --data_path /p/project/intertwin/smalldata/cmcc -------------------------------------------------------------------------------- /tutorials/hpo-workflows/fashion-mnist/config.yaml: 
--------------------------------------------------------------------------------
1 | hpo_training_pipeline:
2 |   _target_: itwinai.pipeline.Pipeline
3 |   steps:
4 |     - _target_: data.FashionMNISTGetter
5 |     - _target_: data.FashionMNISTSplitter
6 |       train_proportion: 0.9
7 |       validation_proportion: 0.1
8 |       test_proportion: 0.0
9 |     - _target_: trainer.FashionMNISTTrainer
10 |
11 |       # In this case we have nothing to pass to the TrainingConfiguration. Some of its fields
12 |       # will be overridden using the hyperparameters sampled from the search space by the tuner.
13 |       config: null
14 |
15 |       epochs: 2
16 |
17 |       # For more info: https://docs.ray.io/en/latest/train/api/doc/ray.train.ScalingConfig.html
18 |       ray_scaling_config:
19 |         _target_: ray.train.ScalingConfig
20 |         num_workers: 1
21 |         use_gpu: true
22 |         resources_per_worker:
23 |           CPU: 8
24 |           GPU: 1
25 |
26 |       # For more info: https://docs.ray.io/en/latest/tune/api/doc/ray.tune.TuneConfig.html
27 |       ray_tune_config:
28 |         _target_: ray.tune.TuneConfig
29 |         num_samples: 2
30 |         scheduler:
31 |           _target_: ray.tune.schedulers.ASHAScheduler
32 |           metric: loss # name of the metric to optimize during HPO
33 |           mode: min
34 |           max_t: 10
35 |           grace_period: 5
36 |           reduction_factor: 4
37 |           brackets: 1
38 |
39 |       # For more info: https://docs.ray.io/en/latest/tune/api/doc/ray.tune.RunConfig.html
40 |       ray_run_config:
41 |         _target_: ray.tune.RunConfig
42 |         storage_path: ${itwinai.cwd:}/ray_checkpoints
43 |         name: FashionMNIST-HPO-Experiment
44 |
45 |       # For more info: https://docs.ray.io/en/latest/tune/api/search_space.html
46 |       ray_search_space:
47 |         batch_size:
48 |           type: choice
49 |           categories: [32, 64, 128]
50 |         learning_rate:
51 |           type: uniform
52 |           lower: 1e-5
53 |           upper: 1e-3
54 |
55 |       strategy: ddp
56 |       logger:
57 |         _target_: itwinai.loggers.LoggersCollection
58 |         loggers:
59 |           - _target_: itwinai.loggers.MLFlowLogger
60 |             experiment_name: FashionMNIST HPO Experiment
61 |             log_freq: batch
62 |
-------------------------------------------------------------------------------- /src/itwinai/slurm/slurm_script_configuration.py: --------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from pydantic import BaseModel
4 |
5 | from itwinai.slurm.slurm_constants import SLURM_TEMPLATE
6 |
7 |
8 | class SlurmScriptConfiguration(BaseModel):
9 |     """Configuration object for the SLURM script. It contains all the settings for the
10 |     SLURM script, such as which hardware you are requesting or for how long to run it.
11 |     As it allows for any ``pre_exec_command`` and ``exec_command``, it should work for
12 |     any SLURM script.
13 |     """
14 |
15 |     # Settings for the SLURM configuration
16 |     job_name: str | None = None
17 |     account: str
18 |     partition: str
19 |     time: str
20 |
21 |     std_out: Path | None = None
22 |     err_out: Path | None = None
23 |
24 |     num_nodes: int
25 |     num_tasks_per_node: int
26 |     gpus_per_node: int
27 |     cpus_per_task: int
28 |     memory: str
29 |     exclusive: bool = False
30 |
31 |     # Typically used to set up the environment before executing the command,
32 |     # e.g. "ml Python", "source .venv/bin/activate" etc.
33 |     pre_exec_command: str | None = None
34 |
35 |     # Command to execute, typically an 'srun' command
36 |     exec_command: str | None = None
37 |
38 |     def exclusive_line(self) -> str:
39 |         return "#SBATCH --exclusive" if self.exclusive else ""
40 |
41 |     def generate_script(self) -> str:
42 |         """Uses the provided configuration parameters and formats a SLURM script with
43 |         the requested settings.
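        A minimal usage sketch (all values below are illustrative placeholders):

        >>> cfg = SlurmScriptConfiguration(
        ...     job_name="demo", account="my_account", partition="gpu",
        ...     time="01:00:00", std_out=Path("job.out"), err_out=Path("job.err"),
        ...     num_nodes=1, num_tasks_per_node=1, gpus_per_node=4, cpus_per_task=8,
        ...     memory="64G", pre_exec_command="ml Python",
        ...     exec_command="srun python train.py",
        ... )
        >>> script = cfg.generate_script()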
44 |
45 |         Returns:
46 |             str: A string containing the SLURM script.
47 |         """
48 |         if (
49 |             self.std_out is None
50 |             or self.err_out is None
51 |             or self.job_name is None
52 |             or self.pre_exec_command is None
53 |             or self.exec_command is None
54 |         ):
55 |             raise ValueError(
56 |                 "SlurmScriptConfiguration has some fields set to None! Make sure to set all"
57 |                 " fields before generating script! Configuration was formatted as follows:\n"
58 |                 f"{repr(self)}"
59 |             )
60 |
61 |         return SLURM_TEMPLATE.format_map(
62 |             self.model_dump() | {"exclusive_line": self.exclusive_line()}
63 |         )
64 |
-------------------------------------------------------------------------------- /docs/installation/post_itwinai_installation.rst: --------------------------------------------------------------------------------
1 | .. note::
2 |     If you want to use the Prov4ML logger, you need to install it explicitly since it is only
3 |     available on GitHub:
4 |
5 |     For systems with Nvidia GPUs:
6 |
7 |     .. code-block:: bash
8 |
9 |         uv pip install "prov4ml[nvidia]@git+https://github.com/matbun/ProvML@v0.0.2"
10 |
11 |     For macOS:
12 |
13 |     .. code-block:: bash
14 |
15 |         uv pip install "prov4ml[apple]@git+https://github.com/matbun/ProvML@v0.0.2"
16 |
17 |
18 | Installing Horovod and Microsoft DeepSpeed
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | If you also want to install Horovod and Microsoft DeepSpeed for distributed ML with
21 | PyTorch, then make sure to install them **after** ``itwinai``. You can choose whether you
22 | want to do this with or without GPU (CUDA) support:
23 |
24 | .. tab-set::
25 |
26 |     .. tab-item:: CPU
27 |
28 |         .. code-block:: bash
29 |
30 |             uv pip install --no-cache-dir --no-build-isolation git+https://github.com/horovod/horovod.git@3a31d93
31 |             uv pip install --no-cache-dir --no-build-isolation deepspeed==0.16.8
32 |
33 |
34 |     .. tab-item:: CUDA
35 |
36 |         .. code-block:: bash
37 |
38 |             curl -fsSL https://github.com/interTwin-eu/itwinai/raw/main/env-files/torch/install-horovod-deepspeed-cuda.sh | bash
39 |
40 |
41 | .. warning::
42 |
43 |     Horovod requires ``CMake>=3.13`` and
44 |     `other packages `_.
45 |     Make sure to have them installed in your environment before proceeding.
46 |
47 |
48 | .. warning::
49 |     The installation of Horovod and DeepSpeed needs to be executed on a machine/node where GPUs
50 |     are available. On some HPC systems, such as the `JUWELS `_
51 |     system on JSC, GPUs **are not available on login nodes** (the host you connect to when you
52 |     SSH into the system), only on **compute nodes**. On the JUWELS system, run this command to
53 |     install DeepSpeed and Horovod directly **from the repository's root**:
54 |
55 |     .. code-block:: bash
56 |
57 |         curl -fsSL https://github.com/interTwin-eu/itwinai/raw/main/env-files/torch/horovod-deepspeed-JSC.slurm | sbatch
58 |
59 |
-------------------------------------------------------------------------------- /env-files/tensorflow/createEnvJSCTF.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # -*- coding: utf-8 -*-
3 |
4 | if [ ! -f "env-files/tensorflow/generic_tf.sh" ]; then
5 |   echo "ERROR: env-files/tensorflow/generic_tf.sh not found!"
6 |   exit 1
7 | fi
8 |
9 | # set modules
10 | ml --force purge
11 |
12 | # get sys info
13 | cDir=$PWD
14 | sysN="$(uname -n | cut -f2- -d.)"
15 | echo "system:${sysN}"
16 | echo
17 |
18 | cont1=false
19 | if [ "$sysN" = 'hdfml' ] ; then
20 |   # NOTE: REFLECT THEM IN THE MAIN README!
21 |   ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 MPI-settings/CUDA Python/3.11 HDF5 PnetCDF libaio mpi4py CMake cuDNN/8.9.5.29-CUDA-12
22 |   cont1=true
23 | else
24 |   echo
25 |   echo 'unknown system detected'
26 |   echo 'canceling'
27 |   echo
28 | fi
29 | echo "modules loaded"
30 | echo
31 |
32 | # get python version
33 | pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)"
34 | echo "python version is ${pver}"
35 | echo
36 |
37 | if [ "$cont1" = true ] ; then
38 |   if [ -d "${cDir}/envAItf_${sysN}" ];then
39 |     echo 'env already exists'
40 |     echo
41 |
42 |     source envAItf_${sysN}/bin/activate
43 |   else
44 |     # create env
45 |     python -m venv envAItf_${sysN}
46 |
47 |     # get headers for pip
48 |     if [ -f "${cDir}/envAItf_${sysN}/bin/pip" ]; then
49 |       echo 'pip already exists'
50 |     else
51 |       cp "$(which pip)" $cDir/envAItf_${sysN}/bin/
52 |       ln -s $cDir/envAItf_${sysN}/bin/pip $cDir/envAItf_${sysN}/bin/pip${pver}
53 |       var="#!$cDir/envAItf_${sysN}/bin/python${pver}"
54 |       sed -i "1s|.*|$var|" $cDir/envAItf_${sysN}/bin/pip
55 |     fi
56 |
57 |     # activate env
58 |     source envAItf_${sysN}/bin/activate
59 |
60 |     echo "a new env is created in ${cDir}"
61 |     echo "activation is done via:"
62 |     echo "source ${cDir}/envAItf_${sysN}/bin/activate"
63 |   fi
64 | fi
65 |
66 | # Install TF dependencies in env
67 | export ENV_NAME="envAItf_$sysN"
68 | bash env-files/tensorflow/generic_tf.sh
69 | source $ENV_NAME/bin/activate
70 |
71 | # JUBE benchmarking environment
72 | if [ -f "${cDir}/envAItf_${sysN}/bin/jube" ]; then
73 |   echo 'JUBE already installed'
74 | else
75 |   pip install --no-cache-dir http://apps.fz-juelich.de/jsc/jube/jube2/download.php?version=latest
76 | fi
77 |
78 | # # get rest of the libraries
79 | # if [ "$cont1" = true ] ; then
80 | #   pip install -r reqs_TF.txt #--ignore-installed
81 | # fi
82 |
83 |
-------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-2-trainer-class/sample_code.py: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Part of the interTwin Project: https://www.intertwin.eu/
3 | #
4 | # Created by: Jarl Sondre Sæther
5 | #
6 | # Credit:
7 | # - Jarl Sondre Sæther - CERN
8 | # --------------------------------------------------------------------------------------
9 |
10 | """This file contains the sample code that was used for the snippets in the interTwin
11 | presentation held on Feb. 18. These code snippets are meant as outlines for how to use
12 | itwinai to simplify distributed ML.
13 | """
14 |
15 | from itwinai.torch.distributed import TorchDDPStrategy
16 | from itwinai.torch.trainer import TorchTrainer
17 |
18 |
19 | # Included for the sake of linting
20 | def train(model):
21 |     pass
22 |
23 |
24 | ##############################################################################
25 | # Using itwinai's Strategy but not the TorchTrainer
26 | ##############################################################################
27 |
28 | # Create and initialize strategy
29 | strategy = TorchDDPStrategy(backend="nccl")
30 | strategy.init()
31 |
32 | # Create dataset as usual
33 | train_dataset = ...
34 |
35 | # Use 'strategy' to create dataloader
36 | train_dataloader = strategy.create_dataloader(train_dataset, ...)
37 |
38 | # Create model, optimizer and scheduler as usual
39 | model, optimizer, scheduler = ...
40 | 41 | # Distribute them using 'strategy' 42 | model, optimizer, scheduler = strategy.distributed(model, optimizer, scheduler) 43 | 44 | # Train model as usual 45 | train(model) # Note: have to notify 'strategy' every time an epoch passes 46 | 47 | # Clean up strategy at the end 48 | strategy.clean_up() 49 | ############################################################################## 50 | 51 | 52 | ############################################################################## 53 | # Using itwinai's TorchTrainer (which uses Strategy internally) 54 | ############################################################################## 55 | 56 | # Create dataset as usual 57 | train_dataset = ... 58 | 59 | # Create model as usual 60 | model = ... 61 | 62 | trainer = TorchTrainer(config={}, model=model, strategy="ddp") 63 | 64 | _, _, _, trained_model = trainer.execute(train_dataset, ...) 65 | ############################################################################## 66 | -------------------------------------------------------------------------------- /use-cases/mnist/torch/saver.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | """This module is used during inference to save predicted labels to file.""" 11 | 12 | import csv 13 | import os 14 | import shutil 15 | from typing import Dict, List, Optional 16 | 17 | from itwinai.components import Saver, monitor_exec 18 | 19 | 20 | class TorchMNISTLabelSaver(Saver): 21 | """Serializes to disk the labels predicted for MNIST dataset.""" 22 | 23 | def __init__( 24 | self, 25 | save_dir: str = "mnist_predictions", 26 | predictions_file: str = "predictions.csv", 27 | class_labels: Optional[List] = None, 28 | ) -> None: 29 | super().__init__() 30 | self.save_parameters(**self.locals2params(locals())) 31 | self.save_dir = save_dir 32 | self.predictions_file = predictions_file 33 | self.class_labels = ( 34 | class_labels if class_labels is not None else [f"Digit {i}" for i in range(10)] 35 | ) 36 | 37 | @monitor_exec 38 | def execute( 39 | self, 40 | predicted_classes: Dict[str, int], 41 | ) -> Dict[str, int]: 42 | """Translate predictions from class idx to class label and save 43 | them to disk. 44 | 45 | Args: 46 | predicted_classes (Dict[str, int]): maps unique item ID to 47 | the predicted class ID. 48 | 49 | Returns: 50 | Dict[str, int]: predicted classes. 
51 | """ 52 | if os.path.exists(self.save_dir): 53 | shutil.rmtree(self.save_dir) 54 | os.makedirs(self.save_dir) 55 | 56 | # Map class idx (int) to class label (str) 57 | predicted_labels = { 58 | itm_name: self.class_labels[cls_idx] 59 | for itm_name, cls_idx in predicted_classes.items() 60 | } 61 | 62 | # Save to disk 63 | filepath = os.path.join(self.save_dir, self.predictions_file) 64 | with open(filepath, "w") as csv_file: 65 | writer = csv.writer(csv_file) 66 | for key, value in predicted_labels.items(): 67 | writer.writerow([key, value]) 68 | return predicted_labels 69 | -------------------------------------------------------------------------------- /tutorials/distributed-ml/torch-tutorial-containers/config.yaml: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------- 2 | # Part of the interTwin Project: https://www.intertwin.eu/ 3 | # 4 | # Created by: Matteo Bunino 5 | # 6 | # Credit: 7 | # - Matteo Bunino - CERN 8 | # -------------------------------------------------------------------------------------- 9 | 10 | # General config 11 | dataset_root: .tmp/ 12 | num_classes: 10 13 | batch_size: 64 14 | num_workers_dataloader: 4 15 | pin_memory: False 16 | lr: 0.001 17 | momentum: 0.9 18 | fp16_allreduce: False 19 | use_adasum: False 20 | gradient_predivide_factor: 1.0 21 | epochs: 2 22 | strategy: ddp 23 | test_data_path: mnist-sample-data 24 | inference_model_mlflow_uri: mnist-pre-trained.pth 25 | predictions_dir: mnist-predictions 26 | predictions_file: predictions.csv 27 | class_labels: null 28 | 29 | # Workflows configuration 30 | training_pipeline: 31 | _target_: itwinai.pipeline.Pipeline 32 | steps: 33 | dataloading_step: 34 | _target_: dataloader.MNISTDataModuleTorch 35 | save_path: ${dataset_root} 36 | 37 | training_step: 38 | _target_: itwinai.torch.trainer.TorchTrainer 39 | config: 40 | batch_size: ${batch_size} 41 | num_workers: ${num_workers_dataloader} 42 | pin_memory: ${pin_memory} 43 | lr: ${lr} 44 | momentum: ${momentum} 45 | fp16_allreduce: ${fp16_allreduce} 46 | use_adasum: ${use_adasum} 47 | gradient_predivide_factor: ${gradient_predivide_factor} 48 | 49 | model: 50 | _target_: model.Net 51 | epochs: ${epochs} 52 | metrics: 53 | accuracy: 54 | _target_: torchmetrics.classification.MulticlassAccuracy 55 | num_classes: ${num_classes} 56 | precision: 57 | _target_: torchmetrics.classification.MulticlassPrecision 58 | num_classes: ${num_classes} 59 | recall: 60 | _target_: torchmetrics.classification.MulticlassRecall 61 | num_classes: ${num_classes} 62 | logger: 63 | _target_: itwinai.loggers.LoggersCollection 64 | loggers: 65 | - _target_: itwinai.loggers.ConsoleLogger 66 | log_freq: 10000 67 | - _target_: itwinai.loggers.MLFlowLogger 68 | experiment_name: MNIST classifier 69 | log_freq: batch 70 | strategy: ${strategy} 71 | # checkpoint_every: 1 72 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: itwinai 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software
10 |
11 | # The order reflects the contributions to the repository to date
12 | # Also including supervisors who attended the meetings
13 | authors:
14 |   - given-names: Matteo
15 |     family-names: Bunino
16 |     email: matteo.bunino@cern.ch
17 |     affiliation: CERN
18 |     orcid: 'https://orcid.org/0009-0008-5100-9300'
19 |   - given-names: Rakesh
20 |     family-names: Sarma
21 |     email: r.sarma@fz-juelich.de
22 |     affiliation: FZ Jülich
23 |   - given-names: Jarl Sondre
24 |     family-names: Sæther
25 |     email: jarl.sondre.saether@cern.ch
26 |     affiliation: CERN
27 |   - given-names: Anna Elisa
28 |     family-names: Lappe
29 |     email: anna.elisa.lappe@cern.ch
30 |     affiliation: CERN
31 |   - given-names: Kalliopi
32 |     family-names: Tsolaki
33 |     email: kalliopi.tsolaki@cern.ch
34 |     affiliation: CERN
35 |   - given-names: Killian
36 |     family-names: Verder
37 |     email: killian.verder@cern.ch
38 |     affiliation: CERN
39 |   - given-names: Henry
40 |     family-names: Mutegeki
41 |     email: henry.mutegeki@cern.ch
42 |     affiliation: CERN
43 |   - given-names: Roman
44 |     family-names: Machacek
45 |     email: roman.machacek@cern.ch
46 |     affiliation: CERN
47 |   - given-names: Alexander
48 |     family-names: Zoechbauer
49 |     email: alexander.zoechbauer@cern.ch
50 |     affiliation: CERN
51 |   - given-names: Mario
52 |     family-names: Ruettgers
53 |     email: m.ruettgers@fz-juelich.de
54 |     affiliation: FZ Jülich
55 |   - given-names: Ilaria
56 |     family-names: Luise
57 |     email: ilaria.luise@cern.ch
58 |     affiliation: CERN
59 |   - given-names: Eric
60 |     family-names: Wulff
61 |     email: eric.wulff@cern.ch
62 |     affiliation: CERN
63 |   - given-names: Maria
64 |     family-names: Girone
65 |     email: maria.girone@cern.ch
66 |     affiliation: CERN
67 |   - given-names: Andreas
68 |     family-names: Lintermann
69 |     email: a.lintermann@fz-juelich.de
70 |     affiliation: FZ Jülich
71 | repository-code: 'https://github.com/interTwin-eu/itwinai'
72 | url: 'https://itwinai.readthedocs.io/'
73 | abstract: AI on cloud and HPC made simple for science
74 | keywords:
75 |   - Artificial intelligence
76 |   - Machine learning
77 |   - Digital twins
78 |   - Climate research
79 |   - Physics research
80 | license: MIT
81 | --------------------------------------------------------------------------------