├── .dockerignore ├── .github ├── renovate.json5 └── workflows │ ├── build.yml │ ├── deploy_docs.yml │ ├── nightly.yml │ ├── publish_container.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── experiment-alternative-resources.py ├── experiment-dask.py ├── experiment-encryption-overhead.py ├── experiment-fractional-resources.py ├── experiment-io-streaming.py ├── experiment-numa.py ├── experiment-per-task-overhead.py ├── experiment-scalability-stress.py ├── experiment-scalability.py ├── experiment-server-cpu-util.py ├── experiment-total-overhead.py ├── main.py ├── postprocess.py ├── requirements.txt ├── src │ ├── __init__.py │ ├── analysis │ │ ├── chart.py │ │ └── dataframe.py │ ├── benchmark │ │ ├── __init__.py │ │ ├── database.py │ │ ├── identifier.py │ │ ├── result.py │ │ └── runner.py │ ├── benchmark_defs.py │ ├── build │ │ ├── hq.py │ │ └── repository.py │ ├── cli.py │ ├── clusterutils │ │ ├── __init__.py │ │ ├── cluster_helper.py │ │ ├── node_list.py │ │ └── profiler.py │ ├── environment │ │ ├── __init__.py │ │ ├── dask.py │ │ ├── hq.py │ │ ├── snake.py │ │ └── utils.py │ ├── executor │ │ ├── executor.py │ │ ├── executor_script.py │ │ ├── external_executor.py │ │ ├── local_executor.py │ │ └── serialization.py │ ├── monitoring │ │ ├── monitor_script.py │ │ └── record.py │ ├── postprocessing │ │ ├── common.py │ │ ├── monitor.py │ │ ├── overview.py │ │ ├── report.py │ │ ├── serve.py │ │ └── templates │ │ │ ├── benchmark.html │ │ │ ├── compare_table.html │ │ │ ├── main.html │ │ │ ├── summary.html │ │ │ └── workload.html │ ├── submit │ │ ├── execute_script.py │ │ ├── options.py │ │ ├── slurm.py │ │ ├── submit.py │ │ └── utils.py │ ├── trace │ │ └── export.py │ ├── utils │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── io.py │ │ ├── process.py │ │ └── timing.py │ └── workloads │ │ ├── __init__.py │ │ ├── empty.py │ │ ├── sleep.py │ │ ├── sleep_resources.py │ │ ├── stress.py │ │ ├── utils.py │ │ └── workload.py └── sw_upload.py ├── crates ├── hyperqueue │ ├── Cargo.toml │ ├── benches │ │ └── benchmark.rs │ └── src │ │ ├── bin │ │ └── hq.rs │ │ ├── client │ │ ├── autoalloc.rs │ │ ├── commands │ │ │ ├── autoalloc.rs │ │ │ ├── data.rs │ │ │ ├── doc.rs │ │ │ ├── job.rs │ │ │ ├── journal │ │ │ │ ├── mod.rs │ │ │ │ └── output.rs │ │ │ ├── mod.rs │ │ │ ├── outputlog.rs │ │ │ ├── server.rs │ │ │ ├── submit │ │ │ │ ├── command.rs │ │ │ │ ├── defs.rs │ │ │ │ ├── directives.rs │ │ │ │ ├── jobfile.rs │ │ │ │ └── mod.rs │ │ │ ├── wait.rs │ │ │ └── worker.rs │ │ ├── globalsettings.rs │ │ ├── job.rs │ │ ├── mod.rs │ │ ├── output │ │ │ ├── cli.rs │ │ │ ├── common.rs │ │ │ ├── json.rs │ │ │ ├── mod.rs │ │ │ ├── outputs.rs │ │ │ └── quiet.rs │ │ ├── resources.rs │ │ ├── server.rs │ │ ├── status.rs │ │ ├── task.rs │ │ └── utils.rs │ │ ├── common │ │ ├── arraydef.rs │ │ ├── arrayparser.rs │ │ ├── cli.rs │ │ ├── env.rs │ │ ├── error.rs │ │ ├── format.rs │ │ ├── idcounter.rs │ │ ├── manager │ │ │ ├── common.rs │ │ │ ├── info.rs │ │ │ ├── mod.rs │ │ │ ├── pbs.rs │ │ │ └── slurm.rs │ │ ├── mod.rs │ │ ├── parser.rs │ │ ├── parser2.rs │ │ ├── placeholders.rs │ │ ├── rpc.rs │ │ ├── serialization.rs │ │ ├── serverdir.rs │ │ ├── setup.rs │ │ └── utils │ │ │ ├── controlflow.rs │ │ │ ├── fs.rs │ │ │ ├── mod.rs │ │ │ ├── network.rs │ │ │ ├── str.rs │ │ │ └── time.rs │ │ ├── dashboard │ │ ├── data │ │ │ ├── data.rs │ │ │ ├── fetch.rs │ │ │ ├── mod.rs │ │ │ ├── time_based_vec.rs │ │ │ ├── time_interval.rs 
│ │ │ └── timelines │ │ │ │ ├── alloc_timeline.rs │ │ │ │ ├── job_timeline.rs │ │ │ │ ├── mod.rs │ │ │ │ └── worker_timeline.rs │ │ ├── mod.rs │ │ ├── ui │ │ │ ├── mod.rs │ │ │ ├── screen.rs │ │ │ ├── screens │ │ │ │ ├── autoalloc │ │ │ │ │ ├── alloc_timeline_chart.rs │ │ │ │ │ ├── allocations_info_table.rs │ │ │ │ │ ├── mod.rs │ │ │ │ │ ├── queue_info_table.rs │ │ │ │ │ └── queue_params_display.rs │ │ │ │ ├── cluster │ │ │ │ │ ├── mod.rs │ │ │ │ │ ├── overview │ │ │ │ │ │ ├── mod.rs │ │ │ │ │ │ ├── worker_count_chart.rs │ │ │ │ │ │ └── worker_table.rs │ │ │ │ │ └── worker │ │ │ │ │ │ ├── cpu_util_table.rs │ │ │ │ │ │ ├── mod.rs │ │ │ │ │ │ ├── worker_config_table.rs │ │ │ │ │ │ └── worker_utilization_chart.rs │ │ │ │ ├── jobs │ │ │ │ │ ├── job_info_display.rs │ │ │ │ │ ├── job_tasks_chart.rs │ │ │ │ │ ├── jobs_table.rs │ │ │ │ │ ├── mod.rs │ │ │ │ │ └── overview.rs │ │ │ │ ├── mod.rs │ │ │ │ └── root_screen.rs │ │ │ ├── styles.rs │ │ │ ├── terminal.rs │ │ │ └── widgets │ │ │ │ ├── chart.rs │ │ │ │ ├── filled_rectangle.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── progressbar.rs │ │ │ │ ├── table.rs │ │ │ │ ├── tasks_table.rs │ │ │ │ └── text.rs │ │ ├── ui_loop.rs │ │ └── utils.rs │ │ ├── lib.rs │ │ ├── server │ │ ├── autoalloc │ │ │ ├── config.rs │ │ │ ├── mod.rs │ │ │ ├── process.rs │ │ │ ├── queue │ │ │ │ ├── common.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── pbs.rs │ │ │ │ └── slurm.rs │ │ │ ├── service.rs │ │ │ └── state.rs │ │ ├── backend.rs │ │ ├── bootstrap.rs │ │ ├── client │ │ │ ├── autoalloc.rs │ │ │ ├── mod.rs │ │ │ └── submit.rs │ │ ├── event │ │ │ ├── journal │ │ │ │ ├── mod.rs │ │ │ │ ├── prune.rs │ │ │ │ ├── read.rs │ │ │ │ ├── stream.rs │ │ │ │ └── write.rs │ │ │ ├── mod.rs │ │ │ ├── payload.rs │ │ │ └── streamer.rs │ │ ├── job.rs │ │ ├── mod.rs │ │ ├── restore.rs │ │ ├── state.rs │ │ ├── tako_events.rs │ │ └── worker.rs │ │ ├── stream │ │ ├── mod.rs │ │ └── reader │ │ │ ├── mod.rs │ │ │ └── outputlog.rs │ │ ├── tests │ │ ├── mod.rs │ │ ├── server.rs │ │ └── utils.rs │ │ ├── transfer │ │ ├── auth.rs │ │ ├── connection.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ ├── protocol.rs │ │ └── stream.rs │ │ └── worker │ │ ├── bootstrap.rs │ │ ├── hwdetect.rs │ │ ├── mod.rs │ │ ├── parser.rs │ │ ├── start │ │ ├── mod.rs │ │ └── program.rs │ │ └── streamer.rs ├── pyhq │ ├── Cargo.toml │ ├── README.md │ ├── pyproject.toml │ ├── python │ │ └── hyperqueue │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── cluster │ │ │ └── __init__.py │ │ │ ├── common.py │ │ │ ├── ffi │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── cluster.py │ │ │ └── protocol.py │ │ │ ├── job.py │ │ │ ├── output.py │ │ │ ├── task │ │ │ ├── __init__.py │ │ │ ├── function │ │ │ │ ├── __init__.py │ │ │ │ └── wrapper.py │ │ │ ├── program.py │ │ │ └── task.py │ │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── package.py │ │ │ └── string.py │ │ │ ├── validation.py │ │ │ └── visualization.py │ └── src │ │ ├── client │ │ ├── job.rs │ │ ├── mod.rs │ │ └── server.rs │ │ ├── cluster │ │ ├── mod.rs │ │ ├── server.rs │ │ └── worker.rs │ │ ├── lib.rs │ │ ├── marshal.rs │ │ └── utils │ │ ├── error.rs │ │ └── mod.rs └── tako │ ├── Cargo.toml │ ├── README.md │ ├── benches │ ├── benchmark.rs │ ├── benchmarks │ │ ├── core.rs │ │ ├── mod.rs │ │ ├── scheduler.rs │ │ └── worker.rs │ └── utils │ │ └── mod.rs │ └── src │ ├── comm.rs │ ├── connection.rs │ ├── control.rs │ ├── events.rs │ ├── gateway.rs │ ├── hwstats.rs │ ├── internal │ ├── common │ │ ├── data_structures.rs │ │ ├── error.rs │ │ ├── ids.rs │ │ ├── index.rs │ │ ├── mod.rs │ │ ├── resources │ │ │ ├── 
allocation.rs │ │ │ ├── amount.rs │ │ │ ├── descriptor.rs │ │ │ ├── map.rs │ │ │ ├── mod.rs │ │ │ └── request.rs │ │ ├── rpc.rs │ │ ├── stablemap.rs │ │ ├── taskgroup.rs │ │ ├── trace.rs │ │ ├── utils.rs │ │ └── wrapped.rs │ ├── datasrv │ │ ├── dataobj.rs │ │ ├── datastorage.rs │ │ ├── download.rs │ │ ├── local_client.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ ├── test_utils.rs │ │ ├── tests.rs │ │ ├── upload.rs │ │ └── utils.rs │ ├── messages │ │ ├── auth.rs │ │ ├── common.rs │ │ ├── mod.rs │ │ └── worker.rs │ ├── mod.rs │ ├── scheduler │ │ ├── mod.rs │ │ ├── multinode.rs │ │ ├── query.rs │ │ └── state.rs │ ├── server │ │ ├── client.rs │ │ ├── comm.rs │ │ ├── core.rs │ │ ├── dataobj.rs │ │ ├── dataobjmap.rs │ │ ├── explain.rs │ │ ├── mod.rs │ │ ├── reactor.rs │ │ ├── rpc.rs │ │ ├── task.rs │ │ ├── taskmap.rs │ │ ├── worker.rs │ │ ├── workergroup.rs │ │ ├── workerload.rs │ │ └── workermap.rs │ ├── tests │ │ ├── integration │ │ │ ├── mod.rs │ │ │ ├── test_basic.rs │ │ │ ├── test_resources.rs │ │ │ ├── test_secret.rs │ │ │ ├── test_worker.rs │ │ │ └── utils │ │ │ │ ├── api.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── server.rs │ │ │ │ ├── task.rs │ │ │ │ └── worker.rs │ │ ├── mod.rs │ │ ├── test_query.rs │ │ ├── test_reactor.rs │ │ ├── test_scheduler_mn.rs │ │ ├── test_scheduler_sn.rs │ │ ├── test_worker.rs │ │ └── utils │ │ │ ├── env.rs │ │ │ ├── mod.rs │ │ │ ├── resources.rs │ │ │ ├── schedule.rs │ │ │ ├── shared.rs │ │ │ ├── task.rs │ │ │ ├── worker.rs │ │ │ └── workflows.rs │ ├── transfer │ │ ├── auth.rs │ │ ├── mod.rs │ │ └── transport.rs │ └── worker │ │ ├── comm.rs │ │ ├── configuration.rs │ │ ├── data │ │ ├── download.rs │ │ ├── localcomm.rs │ │ ├── mod.rs │ │ └── upload.rs │ │ ├── hwmonitor │ │ ├── amd.rs │ │ ├── mod.rs │ │ └── nvidia.rs │ │ ├── localcomm.rs │ │ ├── mod.rs │ │ ├── reactor.rs │ │ ├── resources │ │ ├── allocator.rs │ │ ├── concise.rs │ │ ├── map.rs │ │ ├── mod.rs │ │ └── pool.rs │ │ ├── rpc.rs │ │ ├── rqueue.rs │ │ ├── state.rs │ │ ├── task.rs │ │ ├── task_comm.rs │ │ └── test_util.rs │ ├── launcher.rs │ ├── lib.rs │ └── program.rs ├── docs ├── README.md ├── changelog.md ├── cheatsheet.md ├── cli-reference │ └── .gitkeep ├── cli │ ├── dashboard.md │ ├── output-mode.md │ └── shortcuts.md ├── deployment │ ├── allocation.md │ ├── cloud.md │ ├── index.md │ ├── server.md │ └── worker.md ├── faq.md ├── imgs │ ├── architecture-bg.png │ ├── architecture.png │ ├── architecture.svg │ ├── cheatsheet.png │ ├── cheatsheet.svg │ ├── dashboard.gif │ ├── hq-comparison-table.png │ ├── hq.png │ ├── schema.png │ ├── schema.svg │ ├── streaming.png │ └── streaming.svg ├── installation.md ├── jobs │ ├── arrays.md │ ├── cresources.md │ ├── directives.md │ ├── explain.md │ ├── failure.md │ ├── jobfile.md │ ├── jobs.md │ ├── multinode.md │ ├── openjobs.md │ ├── resources.md │ └── streaming.md ├── other-tools.md ├── overrides │ └── main.html ├── python │ ├── client.md │ ├── dependencies.md │ ├── index.md │ └── submit.md ├── quickstart.md ├── requirements.txt └── stylesheets │ └── extra.css ├── examples ├── README.md └── iterative-computation │ └── README.md ├── mkdocs.yml ├── nedoc.conf ├── pytest.ini ├── ruff.toml ├── scripts ├── bless_tests.sh ├── check.sh ├── check_package_versions.py ├── docs │ ├── build_cli_reference.py │ └── copy_examples.py ├── extract_changelog.py ├── get_docs_version.py └── print_vers.py └── tests ├── README.md ├── __init__.py ├── autoalloc ├── __init__.py ├── conftest.py ├── flavor.py ├── mock │ ├── __init__.py │ ├── manager.py │ ├── mock.py │ ├── pbs.py │ └── slurm.py ├── 
test_autoalloc.py ├── test_cli.py ├── test_dryrun.py ├── test_native.py └── utils.py ├── conftest.py ├── job ├── __init__.py ├── test_file_cleanup.py ├── test_job_cat.py └── test_job_forget.py ├── output ├── __init__.py ├── test_json.py └── test_quiet.py ├── pyapi ├── __init__.py ├── binding │ ├── __init__.py │ └── test_server.py ├── test_cluster.py ├── test_dependencies.py ├── test_function.py ├── test_job.py └── test_visualization.py ├── pytest.ini ├── requirements.txt ├── test_array.py ├── test_cpus.py ├── test_datalayer.py ├── test_directives.py ├── test_entries.py ├── test_events.py ├── test_explain.py ├── test_job.py ├── test_job_mn.py ├── test_jobfile.py ├── test_journal.py ├── test_manager.py ├── test_placeholders.py ├── test_resources.py ├── test_server.py ├── test_stream.py ├── test_task.py ├── test_task_cleanup.py ├── test_time.py ├── test_utils.py ├── test_worker.py └── utils ├── __init__.py ├── cmd.py ├── io.py ├── job.py ├── mock.py ├── table.py └── wait.py /.dockerignore: -------------------------------------------------------------------------------- 1 | target/ 2 | -------------------------------------------------------------------------------- /.github/renovate.json5: -------------------------------------------------------------------------------- 1 | { 2 | $schema: "https://docs.renovatebot.com/renovate-schema.json", 3 | extends: [ 4 | "config:recommended", 5 | // Enable the dependency dashboard issue 6 | ":dependencyDashboard", 7 | ], 8 | "schedule": [ 9 | "at 7:00am on monday" 10 | ], 11 | // Group Rust updates into a single PR 12 | "packageRules": [ 13 | { 14 | "matchManagers": [ 15 | "cargo" 16 | ], 17 | "matchUpdateTypes": [ 18 | "minor", 19 | "patch" 20 | ], 21 | "groupName": "Rust non-major dependencies", 22 | "groupSlug": "rust-minor-patch" 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /.github/workflows/deploy_docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v*' 9 | 10 | concurrency: docs 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | if: github.repository_owner == 'It4innovations' 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | # Download all git history to enable git revision history display in docs pages 20 | fetch-depth: 0 21 | - name: Install stable toolchain 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: 1.87.0 26 | override: true 27 | components: clippy, rustfmt 28 | - uses: Swatinem/rust-cache@v2 29 | - name: Set up Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: 3.9 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip wheel setuptools 36 | python -m pip install -r docs/requirements.txt 37 | - name: Install cli_doc 38 | run: cargo install --git https://github.com/spirali/cli_doc 39 | - name: Build HyperQueue 40 | run: cargo build 41 | - name: Build docs 42 | run: mkdocs build 43 | - name: Set Git CI config 44 | run: | 45 | git config user.name gh-ci-deploy-docs 46 | git config user.email gh-ci-deploy-docs@github.com 47 | - name: Calculate docs version 48 | run: | 49 | python3 scripts/get_docs_version.py > version.json 50 | cat version.json 51 | echo "VERSION=$(cat version.json)" >> $GITHUB_ENV 52 | - name: Deploy latest docs 53 | if: fromJson(env.VERSION).type == 'latest' 54 | run: mike deploy --push latest 55 | - name: Deploy stable 
docs 56 | if: fromJson(env.VERSION).type == 'stable' 57 | run: mike deploy --push -u ${{ fromJson(env.VERSION).version }} stable 58 | -------------------------------------------------------------------------------- /.github/workflows/nightly.yml: -------------------------------------------------------------------------------- 1 | name: Create nightly build release 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 23 * * *" 7 | jobs: 8 | set-env: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | version: ${{ env.HQ_VERSION }} 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v4 15 | - name: Set HQ nightly version 16 | run: | 17 | echo "HQ_VERSION=nightly-$(date +'%Y-%m-%d')-${{ github.sha }}" >> $GITHUB_ENV 18 | - name: Print HQ nightly version 19 | run: | 20 | echo "HQ version: ${{ env.HQ_VERSION }}" 21 | build-artifacts: 22 | needs: [ set-env ] 23 | uses: ./.github/workflows/build.yml 24 | if: github.repository_owner == 'It4innovations' 25 | with: 26 | version: ${{ needs.set-env.outputs.version }} 27 | create-tag: 28 | runs-on: ubuntu-latest 29 | needs: [ set-env, build-artifacts ] 30 | steps: 31 | - name: Checkout sources 32 | uses: actions/checkout@v4 33 | - name: Create tag 34 | uses: rickstaa/action-create-tag@v1 35 | with: 36 | tag: nightly 37 | force_push_tag: true 38 | message: Nightly build ${{ needs.set-env.outputs.version }} 39 | create-release: 40 | runs-on: ubuntu-latest 41 | needs: [ create-tag ] 42 | steps: 43 | - name: Checkout sources 44 | uses: actions/checkout@v4 45 | 46 | - name: Generate changelog 47 | run: python3 scripts/extract_changelog.py DEV > generated-changelog.md 48 | 49 | - name: Download artifacts 50 | uses: actions/download-artifact@v4 51 | 52 | - name: Prepare release name 53 | run: | 54 | echo "RELEASE_NAME=Nightly build $(date +'%Y-%m-%d')" >> $GITHUB_ENV 55 | 56 | - name: Create release 57 | uses: ncipollo/release-action@v1 58 | id: create-release 59 | with: 60 | bodyFile: generated-changelog.md 61 | token: ${{ secrets.GITHUB_TOKEN }} 62 | allowUpdates: true 63 | name: ${{ env.RELEASE_NAME }} 64 | prerelease: true 65 | tag: nightly 66 | commit: ${{ github.sha }} 67 | artifacts: archive-*/** 68 | removeArtifacts: true 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .idea 3 | __pycache__ 4 | *.so 5 | *snap 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "crates/hyperqueue", 4 | "crates/tako", 5 | "crates/pyhq" 6 | ] 7 | default-members = [ 8 | "crates/hyperqueue", 9 | "crates/tako" 10 | ] 11 | resolver = "2" 12 | 13 | [workspace.package] 14 | rust-version = "1.87.0" 15 | edition = "2024" 16 | authors = ["Ada Böhm ", "Jakub Beránek "] 17 | 18 | [workspace.dependencies] 19 | tokio = "1" 20 | log = "0.4" 21 | env_logger = { version = "0.11", features = ["color"] } 22 | clap = "4" 23 | criterion = { version = "0.5", features = ["html_reports"] } 24 | derive_builder = "0.20" 25 | serde = { version = "1", features = ["rc"] } 26 | serde_json = "1" 27 | serde_bytes = "0.11" 28 | bytes = "1" 29 | chrono = "0.4" 30 | orion = "0.17" 31 | smallvec = "1" 32 | bincode = "1" 33 | futures = "0.3" 34 | tokio-util = "0.7" 35 | hex = "0.4" 36 | rand = "0.9" 37 | gethostname = "1.0" 38 | thiserror = "2" 39 | tempfile = "3.12.0" 40 | 
tracing = "0.1" 41 | anyhow = "1" 42 | nix = { version = "0.29", features = ["process", "signal"] } 43 | bstr = { version = "1", features = ["serde"] } 44 | psutil = "3" 45 | thin-vec = { version = "0.2", features = ["serde"] } 46 | bitflags = { version = "2", features = ["serde"] } 47 | 48 | [workspace.lints.clippy] 49 | dbg_macro = "deny" 50 | 51 | [profile.release] 52 | panic = "abort" 53 | 54 | # Profile designed for the most optimized release build that is distributed 55 | # to users. 56 | [profile.dist] 57 | inherits = "release" 58 | lto = true 59 | codegen-units = 1 60 | debug = "line-tables-only" -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lukemathwalker/cargo-chef:latest-rust-1 AS chef 2 | 3 | ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse 4 | WORKDIR /app 5 | 6 | FROM chef as planner 7 | 8 | COPY . . 9 | RUN cargo chef prepare --recipe-path recipe.json 10 | 11 | FROM chef AS builder 12 | WORKDIR /build 13 | COPY --from=planner /app/recipe.json recipe.json 14 | 15 | # Build dependencies and cache them in a Docker layer 16 | RUN cargo chef cook --release --recipe-path recipe.json 17 | 18 | # Build HyperQueue itself 19 | COPY . . 20 | RUN cargo build --release 21 | 22 | FROM ubuntu:22.04 AS runtime 23 | 24 | WORKDIR / 25 | COPY --from=builder /build/target/release/hq hq 26 | 27 | ENTRYPOINT ["./hq"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-present, Ada Böhm, Jakub Beranek 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark suite 2 | This directory contains a framework for running various benchmarks. 3 | 4 | It has support for spawning a distributed cluster for various tools (HQ, SnakeMake, ...), along with 5 | node monitoring and profiling. Some features are only available for HyperQueue clusters. 6 | 7 | The results of benchmarks are stored into JSON files, which can be used to generate HTML dashboards. 8 | 9 | ## Quick start 10 | The benchmarks are meant to be launched from Python code. 
You can find some examples in `main.py`. 11 | To compare HyperQueue with zero-worker and with normal worker, you can run: 12 | ```bash 13 | $ python main.py compare-zw 14 | ``` 15 | The results will be stored into `benchmarks/zw`. 16 | 17 | ## Available profilers 18 | You can attach various profilers to the HyperQueue server or the workers. Use the `server_profilers` 19 | and/or `worker_profilers` attribute of `HqClusterInfo`. 20 | 21 | ### Flamegraph (`FlamegraphProfiler`) 22 | Uses `perf` for stack sampling, results are rendered as a flamegraph. 23 | 24 | ### Perf events (`PerfEventsProfiler`) 25 | Uses `perf stat` to gather various CPU performance events. 26 | 27 | ### Callgrind (`CallgrindProfiler`) 28 | Uses Callgrind to instrument the profiled binary. The results can be visualized e.g. using KCacheGrind. 29 | Note that using Callgrind can slow down the execution by orders of magnitude. 30 | 31 | ### Cachegrind (`CachegrindProfiler`) 32 | Uses Cachegrind to instrument the profiled binary. The results can be visualized e.g. using KCacheGrind. 33 | Note that using Cachegrind can slow down the execution by orders of magnitude. 34 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | typer==0.9.0 2 | gitpython==3.1.41 3 | git+https://code.it4i.cz/def/cluster.git@3e2bcf58c0224bd0e0889c8a0f95957e4d969ca6 4 | pandas==1.3.3 5 | tqdm==4.66.3 6 | pyserde==0.12.3 7 | psutil==5.8.0 8 | humanize==3.12.0 9 | git+https://github.com/it4innovations/snailwatch@4d590c55e6b1e404e0398e8005dd998f5bc50be9#subdirectory=client 10 | jinja2==3.1.6 11 | matplotlib==3.6.2 12 | distributed==2023.11.0 13 | dask==2023.11.0 14 | seaborn==0.13.0 15 | bokeh==2.4.3 16 | -------------------------------------------------------------------------------- /benchmarks/src/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | CURRENT_DIR = Path(__file__).absolute().parent 4 | ROOT_DIR = CURRENT_DIR.parent.parent 5 | -------------------------------------------------------------------------------- /benchmarks/src/analysis/chart.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def render_chart(path: Path): 7 | assert path.suffix == "" 8 | plt.savefig(f"{path}.png") 9 | plt.savefig(f"{path}.pdf") 10 | -------------------------------------------------------------------------------- /benchmarks/src/analysis/dataframe.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import List, Any, Callable, Tuple 3 | 4 | import pandas as pd 5 | 6 | from ..benchmark.database import Database, DatabaseRecord 7 | 8 | 9 | class DataFrameExtractor: 10 | def __init__(self, database: Database): 11 | self.database = database 12 | self.keys: List[str] = [] 13 | self.transforms: List[Tuple[str, Callable[[DatabaseRecord], Any]]] = [] 14 | 15 | def extract(self, *args: str) -> "DataFrameExtractor": 16 | self.keys.extend(args) 17 | return self 18 | 19 | def transform(self, key: str, transform: Callable[[DatabaseRecord], Any]) -> "DataFrameExtractor": 20 | self.transforms.append((key, transform)) 21 | return self 22 | 23 | def build(self) -> pd.DataFrame: 24 | records = defaultdict(list) 25 | 26 | keys = frozenset(self.keys) 27 | for 
record in self.database.records: 28 | for key in keys: 29 | records[key].append(getattr(record, key)) 30 | for key, transform in self.transforms: 31 | records[key].append(transform(record)) 32 | return pd.DataFrame(records) 33 | -------------------------------------------------------------------------------- /benchmarks/src/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It4innovations/hyperqueue/aecde5c53dd41843cc566d87e6fe8036ba26a8fa/benchmarks/src/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmarks/src/benchmark/result.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | 4 | @dataclasses.dataclass(frozen=True) 5 | class BenchmarkResult: 6 | pass 7 | 8 | 9 | @dataclasses.dataclass(frozen=True) 10 | class Timeout(BenchmarkResult): 11 | timeout: float 12 | 13 | def __repr__(self): 14 | return f"Timeout after {self.timeout}s" 15 | 16 | 17 | @dataclasses.dataclass(frozen=True) 18 | class Failure(BenchmarkResult): 19 | traceback: str 20 | 21 | def __repr__(self): 22 | return f"Failure: {self.traceback}" 23 | 24 | 25 | @dataclasses.dataclass(frozen=True) 26 | class Success(BenchmarkResult): 27 | duration: float 28 | 29 | def __repr__(self): 30 | return f"Success: {self.duration}s" 31 | -------------------------------------------------------------------------------- /benchmarks/src/build/repository.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | 4 | from git import Repo 5 | from git.repo.fun import rev_parse 6 | 7 | from .. import ROOT_DIR 8 | 9 | REPO = Repo(ROOT_DIR) 10 | 11 | # This tag represents the active git workspace 12 | TAG_WORKSPACE = "current" 13 | 14 | 15 | def resolve_tag(tag: str) -> str: 16 | if tag == TAG_WORKSPACE: 17 | return tag 18 | return rev_parse(REPO, tag).hexsha 19 | 20 | 21 | @contextlib.contextmanager 22 | def checkout_tag(tag: str): 23 | if tag == TAG_WORKSPACE: 24 | yield 25 | else: 26 | active_branch = REPO.active_branch 27 | 28 | logging.info("Stashing repository") 29 | msg = REPO.git.stash() 30 | try: 31 | aliases = REPO.git.name_rev(["--name-only", tag]).split() 32 | logging.info(f"Checking out {tag} ({', '.join(aliases)})") 33 | REPO.git.checkout(tag) 34 | yield 35 | finally: 36 | logging.info(f"Reverting to original state ({active_branch})") 37 | REPO.git.checkout(active_branch) 38 | if "No local changes to save" not in msg: 39 | REPO.git.stash("pop") 40 | -------------------------------------------------------------------------------- /benchmarks/src/clusterutils/__init__.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | from .node_list import NodeList 4 | 5 | 6 | @dataclasses.dataclass(frozen=True) 7 | class ClusterInfo: 8 | node_list: NodeList 9 | monitor_nodes: bool = False 10 | -------------------------------------------------------------------------------- /benchmarks/src/clusterutils/node_list.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import os 3 | import socket 4 | import subprocess 5 | from typing import List, Optional 6 | 7 | 8 | class NodeList(abc.ABC): 9 | def resolve(self) -> List[str]: 10 | raise NotImplementedError() 11 | 12 | def is_localhost(self) -> bool: 13 | return False 14 | 15 | 16 | class Local(NodeList): 17 | 
HOSTNAME = socket.gethostname() 18 | 19 | def resolve(self) -> List[str]: 20 | return [Local.HOSTNAME] 21 | 22 | def is_localhost(self) -> bool: 23 | return True 24 | 25 | 26 | class PBS(NodeList): 27 | def resolve(self) -> List[str]: 28 | return get_pbs_nodes() 29 | 30 | 31 | def is_inside_pbs() -> bool: 32 | return "PBS_NODEFILE" in os.environ 33 | 34 | 35 | def get_pbs_nodes() -> List[str]: 36 | assert is_inside_pbs() 37 | 38 | with open(os.environ["PBS_NODEFILE"]) as f: 39 | return [line.strip() for line in f] 40 | 41 | 42 | class Slurm(NodeList): 43 | def resolve(self) -> List[str]: 44 | return get_slurm_nodes() 45 | 46 | 47 | def get_slurm_nodes() -> List[str]: 48 | assert is_inside_slurm() 49 | output = subprocess.check_output(["scontrol", "show", "hostnames"]) 50 | return [node.strip() for node in output.decode().split("\n") if node.strip()] 51 | 52 | 53 | def is_inside_slurm() -> bool: 54 | return "SLURM_NODELIST" in os.environ 55 | 56 | 57 | class Explicit(NodeList): 58 | def __init__(self, nodes: List[str]): 59 | self.nodes = nodes 60 | 61 | def resolve(self) -> List[str]: 62 | return self.nodes 63 | 64 | 65 | def get_active_nodes() -> NodeList: 66 | if is_inside_pbs(): 67 | return PBS() 68 | elif is_inside_slurm(): 69 | return Slurm() 70 | else: 71 | return Local() 72 | 73 | 74 | def get_slurm_allocation_id() -> Optional[str]: 75 | return os.environ.get("SLURM_JOB_ID") 76 | -------------------------------------------------------------------------------- /benchmarks/src/environment/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict 3 | 4 | 5 | class Environment: 6 | def start(self): 7 | raise NotImplementedError 8 | 9 | def stop(self): 10 | raise NotImplementedError 11 | 12 | def __enter__(self): 13 | self.start() 14 | return self 15 | 16 | def __exit__(self, exc_type, exc_val, exc_tb): 17 | self.stop() 18 | 19 | 20 | class EnvironmentDescriptor: 21 | """ 22 | This class should describe an instance of an environment. 23 | The class has to be easily picklable and able to create new environments. 24 | It also has to be able to describe itself using metadata. 25 | """ 26 | 27 | def create_environment(self, workdir: Path) -> Environment: 28 | raise NotImplementedError 29 | 30 | def name(self) -> str: 31 | raise NotImplementedError 32 | 33 | def parameters(self) -> Dict[str, Any]: 34 | raise NotImplementedError 35 | 36 | def metadata(self) -> Dict[str, Any]: 37 | return {} 38 | -------------------------------------------------------------------------------- /benchmarks/src/environment/snake.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | from pathlib import Path 4 | from typing import Any, Dict 5 | 6 | from . 
import Environment, EnvironmentDescriptor 7 | from .utils import EnvStateManager 8 | 9 | 10 | @dataclasses.dataclass(frozen=True) 11 | class SnakeClusterInfo: 12 | workdir: Path 13 | 14 | 15 | class SnakeEnvironmentDescriptor(EnvironmentDescriptor): 16 | def create_environment(self, workdir: Path) -> Environment: 17 | info = SnakeClusterInfo(workdir) 18 | return SnakeEnvironment(info) 19 | 20 | def name(self) -> str: 21 | return "snake" 22 | 23 | def parameters(self) -> Dict[str, Any]: 24 | return {} 25 | 26 | def metadata(self) -> Dict[str, Any]: 27 | return {} 28 | 29 | 30 | class SnakeEnvironment(Environment, EnvStateManager): 31 | def __init__(self, info: SnakeClusterInfo): 32 | EnvStateManager.__init__(self) 33 | self.info = info 34 | self.snakefile = info.workdir / "Snakefile" 35 | 36 | @property 37 | def workdir(self) -> Path: 38 | return self.info.workdir 39 | 40 | def start(self): 41 | self.state_start() 42 | 43 | def stop(self): 44 | self.state_stop() 45 | 46 | def submit(self, cmds: str, cpus_per_task: int): 47 | logging.info(f"Starting Snakemake {cmds, cpus_per_task}") 48 | with open(self.snakefile, "w") as f: 49 | f.writelines(cmds) 50 | 51 | from snakemake import snakemake 52 | 53 | ret = snakemake( 54 | snakefile=str(self.snakefile), 55 | quiet=True, 56 | cores=cpus_per_task, 57 | workdir=str(self.workdir), 58 | ) 59 | if not ret: 60 | raise Exception( 61 | f"SnakeMake execution failed. You can find more details in {self.workdir / '.snakemake' / 'log'}" 62 | ) 63 | -------------------------------------------------------------------------------- /benchmarks/src/environment/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, List, Optional, Protocol 3 | 4 | 5 | class Init: 6 | pass 7 | 8 | 9 | class Started: 10 | pass 11 | 12 | 13 | class Stopped: 14 | pass 15 | 16 | 17 | class EnvStateManager: 18 | """ 19 | Helper mixin class that makes sure that an environment is used in the correct order and that 20 | it is not started/stopped multiple times. 21 | """ 22 | 23 | def __init__(self): 24 | self.state = Init() 25 | 26 | def state_start(self): 27 | assert isinstance(self.state, Init) 28 | self.state = Started() 29 | 30 | def state_stop(self): 31 | assert isinstance(self.state, Started) 32 | self.state = Stopped() 33 | 34 | 35 | def sanity_check_nodes(nodes: List[str]): 36 | for node in nodes: 37 | assert len(node) > 0 38 | assert len(set(nodes)) == len(nodes) 39 | assert len(nodes) > 0 40 | 41 | 42 | class WorkerConfig(Protocol): 43 | node: Optional[int] 44 | 45 | 46 | def assign_workers(workers: List[WorkerConfig], nodes: List[str]) -> Dict[str, List[WorkerConfig]]: 47 | round_robin_node = 0 48 | used_round_robin = set() 49 | 50 | node_assignments = defaultdict(list) 51 | for index, worker in enumerate(workers): 52 | node = worker.node 53 | if node is not None: 54 | if not (0 <= node < len(nodes)): 55 | raise Exception( 56 | f"Invalid node assignment. 
Worker {index} wants to be on node " 57 | f"{node}, but there are only {len(nodes)} worker nodes" 58 | ) 59 | else: 60 | node = round_robin_node 61 | round_robin_node = (round_robin_node + 1) % len(nodes) 62 | if node in used_round_robin: 63 | raise Exception(f"There are more workers ({len(workers)}) than worker nodes ({len(nodes)})") 64 | used_round_robin.add(node) 65 | if node >= len(nodes): 66 | raise Exception(f"Selected worker node is {node}, but there are only {len(nodes)} worker node(s)") 67 | node_assignments[nodes[node]].append(worker) 68 | return dict(node_assignments) 69 | -------------------------------------------------------------------------------- /benchmarks/src/executor/executor.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from pathlib import Path 3 | 4 | from ..benchmark.identifier import BenchmarkDescriptor 5 | from ..benchmark.result import BenchmarkResult 6 | 7 | 8 | @dataclasses.dataclass 9 | class BenchmarkContext: 10 | workdir: Path 11 | timeout_s: float 12 | 13 | def __post_init__(self): 14 | self.workdir = self.workdir.resolve() 15 | 16 | 17 | class BenchmarkExecutor: 18 | def execute(self, benchmark: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /benchmarks/src/executor/executor_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import sys 4 | 5 | from ..utils import activate_cwd 6 | from .local_executor import execute_benchmark 7 | from .serialization import SerializedBenchmark, serialize_result 8 | 9 | if __name__ == "__main__": 10 | pipe_path = sys.argv[1] 11 | 12 | data = sys.stdin.buffer.read() 13 | benchmark = pickle.loads(data) 14 | assert isinstance(benchmark, SerializedBenchmark) 15 | 16 | with activate_cwd(benchmark.cwd): 17 | result = execute_benchmark(benchmark.descriptor, benchmark.ctx) 18 | serialized_result = serialize_result(result) 19 | with open(pipe_path, "w") as file: 20 | print(json.dumps(serialized_result), file=file) 21 | -------------------------------------------------------------------------------- /benchmarks/src/executor/local_executor.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | from ..benchmark.identifier import BenchmarkDescriptor 4 | from ..benchmark.result import BenchmarkResult, Failure, Success, Timeout 5 | from ..utils.timing import TimeoutException, with_timeout 6 | from ..workloads.workload import WorkloadExecutionResult 7 | from .executor import BenchmarkContext, BenchmarkExecutor 8 | 9 | 10 | class LocalBenchmarkExecutor(BenchmarkExecutor): 11 | """Executes benchmarks in the current process""" 12 | 13 | def execute(self, benchmark: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 14 | return execute_benchmark(benchmark, ctx) 15 | 16 | 17 | def execute_benchmark(descriptor: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 18 | env = descriptor.env_descriptor.create_environment(ctx.workdir) 19 | workload = descriptor.workload 20 | 21 | def run() -> WorkloadExecutionResult: 22 | return workload.execute(env) 23 | 24 | try: 25 | with env: 26 | result = with_timeout(run, timeout_s=ctx.timeout_s) 27 | return Success(duration=result.duration) 28 | except TimeoutException: 29 | return Timeout(ctx.timeout_s) 30 | except BaseException: 31 | return 
Failure(traceback.format_exc()) 32 | -------------------------------------------------------------------------------- /benchmarks/src/executor/serialization.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | from ..benchmark.identifier import BenchmarkDescriptor 6 | from ..benchmark.result import BenchmarkResult, Failure, Success, Timeout 7 | from .executor import BenchmarkContext 8 | 9 | 10 | @dataclasses.dataclass(frozen=True) 11 | class SerializedBenchmark: 12 | descriptor: BenchmarkDescriptor 13 | ctx: BenchmarkContext 14 | cwd: Path 15 | 16 | 17 | def serialize_result(result: BenchmarkResult) -> Dict[str, Any]: 18 | if isinstance(result, Success): 19 | type = "success" 20 | elif isinstance(result, Timeout): 21 | type = "timeout" 22 | elif isinstance(result, Failure): 23 | type = "failure" 24 | else: 25 | assert False 26 | return dict(type=type, data=result.to_dict()) 27 | 28 | 29 | def deserialize_result(data: Dict[str, Any]) -> BenchmarkResult: 30 | type = data["type"] 31 | data = data["data"] 32 | if type == "success": 33 | return Success.from_dict(data) 34 | elif type == "timeout": 35 | return Timeout.from_dict(data) 36 | elif type == "failure": 37 | return Failure.from_dict(data) 38 | else: 39 | assert False 40 | -------------------------------------------------------------------------------- /benchmarks/src/monitoring/monitor_script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import sys 5 | import time 6 | 7 | import click 8 | import psutil 9 | from cluster.io import measure_and_store 10 | from record import generate_record, MonitoringOptions 11 | 12 | 13 | @click.command() 14 | @click.argument("output") 15 | @click.option("--capture-interval", default=1) 16 | @click.option("--dump-interval", default=10) 17 | @click.option("--observe-pids", default="") 18 | def main(output: str, capture_interval: int, dump_interval: int, observe_pids: str): 19 | options = MonitoringOptions(observe_network=False) 20 | 21 | processes = [] 22 | process_map = {} 23 | for pid in observe_pids.split(","): 24 | if pid: 25 | try: 26 | processes.append(psutil.Process(int(pid))) 27 | logging.info(f"Observing PID {pid}") 28 | except BaseException as e: 29 | logging.error(e) 30 | 31 | def capture(timestamp): 32 | try: 33 | start = time.time() 34 | result = generate_record(timestamp, processes, process_map, options) 35 | duration = time.time() - start 36 | logging.info(f"Capturing data took {duration:.5f}s") 37 | return result 38 | except Exception as e: 39 | logging.error("Opening cluster exception: {}".format(e)) 40 | return None 41 | 42 | def finish(): 43 | logging.info(f"Copying trace from {tmp_output} to {output}") 44 | shutil.copyfile(tmp_output, output) 45 | sys.exit() 46 | 47 | tmp_output = f"/tmp/{os.path.basename(output)}-{int(time.time())}" 48 | 49 | # Create temporary file 50 | with open(tmp_output, "w") as _: 51 | pass 52 | 53 | measure_and_store(capture_interval, dump_interval, tmp_output, capture, finish) 54 | 55 | 56 | if __name__ == "__main__": 57 | logging.basicConfig( 58 | level=logging.INFO, 59 | format="%(levelname)s:%(asctime)s:%(funcName)s: %(message)s", 60 | datefmt="%Y-%m-%d %H:%M:%S", 61 | ) 62 | main() 63 | -------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/benchmark.html: 
--------------------------------------------------------------------------------
1 | Duration: {{ "%.4f"|format(benchmark.record.duration) }} s
2 | {% if benchmark.process_stats %}
[lines 3-21: a "Process utilization" table whose HTML markup was lost in extraction; header row: Hostname | Key | Avg. CPU | Max. RSS; body: {% for (k, v) in benchmark.process_stats.items() %} one row per entry with {{ k[0] }} | {{ k[1] }} | {{ "%.2f"|format(v.avg_cpu) }} % | {{ format_bytes(v.max_rss) }} {% endfor %}]
22 | {% endif %}
23 | {% if node_utilization %}
[lines 24-40: a "Node utilization" table whose HTML markup was lost; header row: Hostname | Avg. CPU | Avg. memory; body: {% for (hostname, data) in node_utilization.items() %} one row per entry with {{ hostname }} | {{ "%.2f"|format(data["cpu"]) }} % | {{ "%.2f"|format(data["memory"]) }} % {% endfor %}]
41 | {% endif %}
42 | {% if benchmark.monitoring_report %}
43 | [a "Cluster report" link (markup lost)]
44 | {% endif %}
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/compare_table.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction; the only recoverable content is the {{tables}} placeholder on line 35.]
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/summary.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction. The recoverable content: for each group in keys, a section titled {{ group|e }} that lists every {{ x|e }} in data[group]; then a "Grouped by benchmark:" section that renders, for each (name, group) pair in data["Grouped by benchmark:"], the benchmark {{ name|e }} followed by its {{ group }} table.]
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/workload.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction. The recoverable content is a header that renders {{ key }} for every key in environments (lines 21-23), followed by the surviving second loop over environments below:]
25 | 26 | {% for key in environments %} 27 | 28 | {% endfor %} 29 | 30 | -------------------------------------------------------------------------------- /benchmarks/src/submit/options.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | from ..utils.io import from_json, to_json 7 | 8 | 9 | @dataclasses.dataclass(frozen=True) 10 | class PBSSubmitOptions: 11 | queue: str 12 | nodes: int 13 | walltime: datetime.timedelta 14 | project: Optional[str] = None 15 | name: Optional[str] = None 16 | init_script: Optional[Path] = None 17 | 18 | 19 | def serialize_submit_options(options: PBSSubmitOptions, path: Path): 20 | with open(path, "w") as f: 21 | to_json(options, f) 22 | 23 | 24 | def deserialize_submit_options(path: Path) -> PBSSubmitOptions: 25 | with open(path) as f: 26 | return from_json(PBSSubmitOptions, f) 27 | -------------------------------------------------------------------------------- /benchmarks/src/submit/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | 5 | def generate_job_dir(workdir: Path) -> Path: 6 | """Tries to find a directory in `workdir` which name is an integer and return a large integer 7 | padded. The returned name is padded by zeros.""" 8 | workdir.mkdir(parents=True, exist_ok=True) 9 | 10 | ids = [] 11 | for item in workdir.iterdir(): 12 | if item.is_dir(): 13 | try: 14 | ids.append(int(item.name)) 15 | except BaseException: 16 | pass 17 | max_id = max(ids or [0]) 18 | dir_name = f"{max_id + 1:03}" 19 | return (workdir / dir_name).absolute() 20 | 21 | 22 | def format_allocation_time(duration: datetime.timedelta) -> str: 23 | days, seconds = duration.days, duration.seconds 24 | hours = days * 24 + seconds // 3600 25 | minutes = (seconds % 3600) // 60 26 | seconds = seconds % 60 27 | 28 | return f"{hours:02}:{minutes:02}:{seconds:02}" 29 | -------------------------------------------------------------------------------- /benchmarks/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import shutil 4 | from pathlib import Path 5 | 6 | 7 | def get_pyenv_from_env() -> str: 8 | return os.environ.get("VIRTUAL_ENV") 9 | 10 | 11 | def ensure_directory(path: Path) -> Path: 12 | path.mkdir(parents=True, exist_ok=True) 13 | return path.absolute() 14 | 15 | 16 | def check_file_exists(path: Path): 17 | if not path.exists(): 18 | raise Exception(f"Path {path} does not exist") 19 | if not path.is_file(): 20 | raise Exception(f"Path {path} is not a file") 21 | 22 | 23 | def is_binary_available(binary: str) -> bool: 24 | return shutil.which(binary) is not None 25 | 26 | 27 | @contextlib.contextmanager 28 | def activate_cwd(directory: Path): 29 | cwd = os.getcwd() 30 | os.chdir(directory) 31 | 32 | try: 33 | yield 34 | finally: 35 | os.chdir(cwd) 36 | -------------------------------------------------------------------------------- /benchmarks/src/utils/io.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from typing import TypeVar 3 | 4 | Type = TypeVar("Type") 5 | 6 | 7 | def from_json(cls: type[Type], input: typing.Union[typing.TextIO, str]) -> Type: 8 | from serde import json 9 | 10 | if not isinstance(input, str): 11 | input = input.read() 12 | return json.from_json(cls, input) 13 | 14 | 15 | def 
to_json(object: typing.Any, file: typing.TextIO): 16 | from serde import json 17 | 18 | serialized = json.to_json(object) 19 | file.write(serialized) 20 | -------------------------------------------------------------------------------- /benchmarks/src/utils/process.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | from typing import Dict, List, Optional 4 | 5 | 6 | def execute_process( 7 | args: List[str], 8 | stdout: Path, 9 | stderr: Path, 10 | env: Optional[Dict[str, str]] = None, 11 | check=True, 12 | ) -> subprocess.CompletedProcess: 13 | with open(stdout, "wb") as stdout_file: 14 | with open(stderr, "wb") as stderr_file: 15 | env = env or {} 16 | result = subprocess.run( 17 | args, 18 | env=env, 19 | stdin=subprocess.DEVNULL, 20 | stdout=stdout_file, 21 | stderr=stderr_file, 22 | ) 23 | if check: 24 | if result.returncode != 0: 25 | with open(stdout) as stdout_file: 26 | with open(stderr) as stderr_file: 27 | raise Exception( 28 | f"""The process {args} has exited with error code {result.returncode} 29 | Stdout: {stdout_file.read()} 30 | Stderr: {stderr_file.read()} 31 | """.strip() 32 | ) 33 | return result 34 | -------------------------------------------------------------------------------- /benchmarks/src/utils/timing.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import contextlib 3 | import multiprocessing.context 4 | import time 5 | from multiprocessing.pool import ThreadPool 6 | from typing import Callable, TypeVar 7 | 8 | DEFAULT_TIMEOUT = 15 9 | 10 | 11 | class TimeoutException(BaseException): 12 | pass 13 | 14 | 15 | def wait_until(fn, sleep_s=0.5, timeout_s=DEFAULT_TIMEOUT): 16 | end = time.time() + timeout_s 17 | 18 | while time.time() < end: 19 | value = fn() 20 | if value is not None and value is not False: 21 | return value 22 | time.sleep(sleep_s) 23 | raise TimeoutException(f"Wait timeouted after {timeout_s} seconds") 24 | 25 | 26 | TIMEOUT_POOL = None 27 | T = TypeVar("T") 28 | 29 | 30 | def with_timeout(fn: Callable[..., T], timeout_s: float) -> T: 31 | global TIMEOUT_POOL 32 | 33 | if TIMEOUT_POOL is None: 34 | # it needs to be more than 1 to avoid deadlocks when with_timeout is nested 35 | TIMEOUT_POOL = ThreadPool(8) 36 | atexit.register(TIMEOUT_POOL.close) 37 | 38 | future = TIMEOUT_POOL.apply_async(fn) 39 | try: 40 | return future.get(timeout=timeout_s) 41 | except multiprocessing.context.TimeoutError: 42 | raise TimeoutException() 43 | 44 | 45 | class Timings: 46 | def __init__(self): 47 | self.timings = {} 48 | 49 | def add(self, name, duration): 50 | assert name not in self.timings 51 | self.timings[name] = duration 52 | 53 | def duration(self) -> float: 54 | return self.timings["duration"] 55 | 56 | def to_dict(self): 57 | return dict(self.timings) 58 | 59 | def __repr__(self): 60 | return repr(self.timings) 61 | 62 | @contextlib.contextmanager 63 | def time(self, name="duration"): 64 | start = time.time() 65 | yield 66 | duration = time.time() - start 67 | self.add(name, duration) 68 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/__init__.py: -------------------------------------------------------------------------------- 1 | from .sleep import SleepHQ 2 | from .stress import StressHQ 3 | from .workload import Workload 4 | 5 | __all__ = [Workload, SleepHQ, StressHQ] 6 | 
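
The `with_timeout` helper and the `Timings` context manager defined in `benchmarks/src/utils/timing.py` above are what the executors combine when measuring a workload run. A minimal usage sketch follows; the `slow_operation` function and the `src.utils.timing` import path are illustrative assumptions for this example, not code from the repository:

```python
import time

# Assumed import path; in this repository the module lives at
# benchmarks/src/utils/timing.py and is imported relative to the `src` package.
from src.utils.timing import Timings, TimeoutException, with_timeout


def slow_operation() -> float:
    """Hypothetical stand-in for a benchmarked operation."""
    time.sleep(0.2)
    return 42.0


timings = Timings()
# The context manager records elapsed wall-clock time under the default "duration" key.
with timings.time():
    try:
        # Runs the callable on a thread pool and raises TimeoutException
        # if it does not finish within timeout_s seconds.
        result = with_timeout(slow_operation, timeout_s=5.0)
    except TimeoutException:
        result = None

print(f"took {timings.duration():.3f}s, result={result}")
```

Keeping the `try`/`except` inside the `with` block ensures the duration is recorded even when the call times out, since `Timings.time()` only stores the measurement when its body finishes.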
-------------------------------------------------------------------------------- /benchmarks/src/workloads/empty.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | from .utils import measure_dask_tasks 4 | from .workload import Workload, WorkloadExecutionResult 5 | from ..environment.dask import DaskEnvironment 6 | 7 | 8 | def empty(): 9 | pass 10 | 11 | 12 | class EmptyDask(Workload): 13 | def __init__(self, task_count: int): 14 | self.task_count = task_count 15 | 16 | def name(self) -> str: 17 | return "empty" 18 | 19 | def parameters(self) -> Dict[str, Any]: 20 | return {"task_count": self.task_count} 21 | 22 | def execute(self, env: DaskEnvironment) -> WorkloadExecutionResult: 23 | from distributed import Client 24 | 25 | def run(client: Client): 26 | tasks = [client.submit(empty, pure=False) for _ in range(self.task_count)] 27 | client.gather(tasks) 28 | 29 | return measure_dask_tasks(env, run) 30 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/sleep_resources.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Any, Dict 3 | 4 | from ..environment.hq import HqEnvironment 5 | from .utils import measure_hq_tasks 6 | from .workload import Workload, WorkloadExecutionResult 7 | 8 | 9 | class SleepWithResources(Workload, ABC): 10 | def __init__(self, task_count: int, resources: Dict[str, Any], sleep_duration=0): 11 | self.task_count = task_count 12 | self.resources = resources 13 | self.sleep_duration = sleep_duration 14 | 15 | def parameters(self) -> Dict[str, Any]: 16 | return dict( 17 | task_count=self.task_count, 18 | resources=self.resources, 19 | duration=self.sleep_duration, 20 | ) 21 | 22 | def name(self) -> str: 23 | return "sleep-with-resources" 24 | 25 | 26 | class SleepWithResourcesHQ(SleepWithResources): 27 | def execute(self, env: HqEnvironment) -> WorkloadExecutionResult: 28 | return measure_hq_tasks( 29 | env, 30 | ["sleep", str(self.sleep_duration)], 31 | task_count=self.task_count, 32 | resources=self.resources, 33 | ) 34 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/stress.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from abc import ABC 3 | from typing import Any, Dict, Optional 4 | 5 | from ..environment import Environment 6 | from ..utils import is_binary_available 7 | from .utils import measure_hq_tasks 8 | from .workload import Workload, WorkloadExecutionResult 9 | 10 | 11 | class Stress(Workload, ABC): 12 | def __init__( 13 | self, 14 | task_count: int, 15 | cpu_count: Optional[int] = None, 16 | stress_duration=1, 17 | ): 18 | self.task_count = task_count 19 | self.cpu_count = cpu_count 20 | self.stress_duration = stress_duration 21 | 22 | def name(self) -> str: 23 | return "stress" 24 | 25 | def parameters(self) -> Dict[str, Any]: 26 | return dict( 27 | task_count=self.task_count, 28 | cpu_count=self.cpu_count, 29 | duration=self.stress_duration, 30 | ) 31 | 32 | def execute(self, env: Environment) -> WorkloadExecutionResult: 33 | assert is_binary_available("stress") 34 | 35 | cpu_count = self.cpu_count or multiprocessing.cpu_count() 36 | return self.compute( 37 | env, 38 | task_count=self.task_count, 39 | cpu_count=cpu_count, 40 | stress_duration=self.stress_duration, 41 | ) 42 | 43 | def compute( 44 | self, env: 
Environment, task_count: int, cpu_count: int, stress_duration: int 45 | ) -> WorkloadExecutionResult: 46 | raise NotImplementedError 47 | 48 | 49 | class StressHQ(Stress): 50 | def compute( 51 | self, env: Environment, task_count: int, cpu_count: int, stress_duration: int 52 | ) -> WorkloadExecutionResult: 53 | return measure_hq_tasks( 54 | env, 55 | ["stress", "--cpu", str(cpu_count), "--timeout", str(stress_duration)], 56 | task_count=task_count, 57 | cpus_per_task=cpu_count, 58 | ) 59 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/workload.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Any, Dict 3 | 4 | from ..environment import Environment 5 | 6 | 7 | @dataclasses.dataclass 8 | class WorkloadExecutionResult: 9 | duration: float 10 | 11 | 12 | class Workload: 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | def parameters(self) -> Dict[str, Any]: 17 | raise NotImplementedError 18 | 19 | def execute(self, env: Environment) -> WorkloadExecutionResult: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /benchmarks/sw_upload.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | import typer 6 | from src.utils import load_database 7 | from swclient.client import Client, Measurement 8 | 9 | app = typer.Typer() 10 | 11 | 12 | def unpack_dict(dictionary: Dict[str, Any]) -> Dict[str, str]: 13 | result = {} 14 | for key, value in dictionary.items(): 15 | if isinstance(value, dict): 16 | value = unpack_dict(value) 17 | for k, v in value.items(): 18 | result[f"{key}/{k}"] = v 19 | else: 20 | result[key] = value 21 | return result 22 | 23 | 24 | def prefix_dict(dictionary: Dict[str, Any], prefix: str) -> Dict[str, Any]: 25 | return {f"{prefix}/{k}": str(v) for (k, v) in dictionary.items()} 26 | 27 | 28 | def normalize_dict(dictionary: Dict[str, Any], prefix: str) -> Dict[str, Any]: 29 | return prefix_dict(unpack_dict(dictionary), prefix=prefix) 30 | 31 | 32 | @app.command() 33 | def upload( 34 | database_path: Path = typer.Argument(..., exists=True), 35 | token: str = typer.Option(...), 36 | ): 37 | client = Client("https://snailwatch.it4i.cz/api", token) 38 | database = load_database(database_path) 39 | measurements = [] 40 | 41 | for record in database.records: 42 | timestamp = datetime.fromtimestamp(record.timestamp) 43 | measurement = Measurement( 44 | benchmark=record.workload, 45 | environment=dict( 46 | **normalize_dict(record.workload_params, "workload"), 47 | **normalize_dict(record.environment_params, "env"), 48 | env=record.environment_params, 49 | **normalize_dict(record.benchmark_metadata, "metadata"), 50 | ), 51 | result=dict(duration=dict(type="time", value=record.duration)), 52 | timestamp=timestamp, 53 | ) 54 | measurements.append(measurement) 55 | 56 | client.upload_measurements(measurements) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() 61 | -------------------------------------------------------------------------------- /crates/hyperqueue/benches/benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::measurement::WallTime; 2 | use criterion::{BenchmarkGroup, Criterion, criterion_group, criterion_main}; 3 | use 
hyperqueue::common::placeholders::{has_placeholders, parse_resolvable_string}; 4 | 5 | fn bench_parse_placeholder(c: &mut BenchmarkGroup<WallTime>) { 6 | c.bench_function("no placeholders", |bencher| { 7 | bencher.iter(|| { 8 | parse_resolvable_string("/tmp/my-very-long-path/that-is-even-longer-than-we-thought") 9 | }); 10 | }); 11 | c.bench_function("single placeholder", |bencher| { 12 | bencher.iter(|| { 13 | parse_resolvable_string( 14 | "/tmp/my-very-long-path/%{TASK_ID}/that-is-even-longer-than-we-thought", 15 | ) 16 | }); 17 | }); 18 | c.bench_function("has_placeholders without placeholder", |bencher| { 19 | bencher.iter(|| { 20 | has_placeholders("/tmp/my-very-long-path/that-is-even-longer-than-we-thought") 21 | }); 22 | }); 23 | c.bench_function("has_placeholders with placeholder", |bencher| { 24 | bencher.iter(|| { 25 | has_placeholders( 26 | "/tmp/my-very-long-path/that-is-even-longer-than-we-thought/%{TASK_ID}", 27 | ) 28 | }); 29 | }); 30 | } 31 | 32 | pub fn benchmark_placeholders(c: &mut Criterion) { 33 | let mut group = c.benchmark_group("placeholder"); 34 | bench_parse_placeholder(&mut group); 35 | } 36 | 37 | criterion_group!(placeholders, benchmark_placeholders); 38 | 39 | criterion_main!(placeholders); 40 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/autoalloc.rs: -------------------------------------------------------------------------------- 1 | use crate::server::autoalloc::AllocationState; 2 | 3 | impl AllocationState { 4 | pub fn is_failed(&self) -> bool { 5 | match self { 6 | AllocationState::Finished { 7 | disconnected_workers, 8 | .. 9 | } => disconnected_workers.all_crashed(), 10 | AllocationState::FinishedUnexpectedly { failed, .. } => *failed, 11 | AllocationState::Queued { .. } | AllocationState::Running { .. } => false, 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/commands/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod autoalloc; 2 | pub mod data; 3 | pub mod doc; 4 | pub mod job; 5 | pub mod journal; 6 | pub mod outputlog; 7 | pub mod server; 8 | pub mod submit; 9 | pub mod wait; 10 | pub mod worker; 11 | 12 | /// Helper macro for generating CLI help for a `Duration` (or `Option<Duration>`) value 13 | /// that can be specified either using the HMS or humantime formats. 14 | macro_rules! duration_doc { 15 | ($text:expr) => { 16 | concat!( 17 | $text, 18 | "\n\n", 19 | r#"You can use either the `HH:MM:SS` format or a "humantime" format.
20 | For example: 21 | - 01:00:00 => 1 hour 22 | - 02:05:10 => 2 hours, 5 minutes, 10 seconds 23 | - 1h => 1 hour 24 | - 2h5m10s => 2 hours, 5 minutes, 10 seconds 25 | - 3h 10m 5s => 3 hours, 10 minutes, 5 seconds 26 | - 2 hours 5 minutes => 2 hours, 5 minutes"# 27 | ) 28 | }; 29 | } 30 | 31 | pub(crate) use duration_doc; 32 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/commands/submit/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod defs; 3 | pub mod directives; 4 | mod jobfile; 5 | 6 | pub use command::SubmitJobTaskConfOpts; 7 | pub use command::{JobSubmitOpts, submit_computation}; 8 | 9 | pub use jobfile::{JobSubmitFileOpts, submit_computation_from_job_file}; 10 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/globalsettings.rs: -------------------------------------------------------------------------------- 1 | use crate::client::output::outputs::Output; 2 | use std::path::{Path, PathBuf}; 3 | 4 | pub struct GlobalSettings { 5 | server_dir: PathBuf, 6 | printer: Box<dyn Output>, 7 | } 8 | 9 | impl GlobalSettings { 10 | pub fn new(server_dir: PathBuf, printer: Box<dyn Output>) -> Self { 11 | GlobalSettings { 12 | server_dir, 13 | printer, 14 | } 15 | } 16 | 17 | pub fn server_directory(&self) -> &Path { 18 | &self.server_dir 19 | } 20 | 21 | pub fn printer(&self) -> &dyn Output { 22 | self.printer.as_ref() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/job.rs: -------------------------------------------------------------------------------- 1 | use crate::rpc_call; 2 | use crate::transfer::connection::ClientSession; 3 | use crate::transfer::messages::{FromClientMessage, ToClientMessage}; 4 | use tako::{Map, WorkerId}; 5 | 6 | /// Maps worker IDs to hostnames.
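///
/// A minimal usage sketch (illustrative only; `session` and `worker_id` are assumed to
/// exist in the caller and are not defined in this module):
/// ```ignore
/// let workers: WorkerMap = get_worker_map(&mut session).await?;
/// if let Some(hostname) = workers.get(&worker_id) {
///     println!("task ran on worker host {hostname}");
/// }
/// ```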
7 | pub type WorkerMap = Map<WorkerId, String>; 8 | 9 | pub async fn get_worker_map(session: &mut ClientSession) -> anyhow::Result<WorkerMap> { 10 | let message = FromClientMessage::WorkerList; 11 | let response = 12 | rpc_call!(session.connection(), message, ToClientMessage::WorkerListResponse(r) => r) 13 | .await?; 14 | let map = response 15 | .workers 16 | .into_iter() 17 | .map(|w| (w.id, w.configuration.hostname)) 18 | .collect(); 19 | Ok(map) 20 | } 21 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/mod.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | pub mod autoalloc; 4 | pub mod commands; 5 | pub mod globalsettings; 6 | pub mod job; 7 | pub mod output; 8 | pub mod resources; 9 | pub mod server; 10 | pub mod status; 11 | pub mod task; 12 | pub mod utils; 13 | 14 | pub fn default_server_directory_path() -> PathBuf { 15 | let mut home = dirs::home_dir().unwrap_or_else(std::env::temp_dir); 16 | home.push(".hq-server"); 17 | home 18 | } 19 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/output/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod cli; 2 | mod common; 3 | pub use common::{Verbosity, VerbosityFlag, resolve_task_paths}; 4 | pub mod json; 5 | pub mod outputs; 6 | pub mod quiet; 7 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/server.rs: -------------------------------------------------------------------------------- 1 | use crate::transfer::connection::ClientConnection; 2 | use crate::transfer::messages::FromClientMessage; 3 | 4 | pub async fn client_stop_server(connection: &mut ClientConnection) -> crate::Result<()> { 5 | connection.send(FromClientMessage::Stop).await?; 6 | log::info!("Stopping server"); 7 | Ok(()) 8 | } 9 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/status.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde::Serialize; 3 | 4 | use crate::server::job::JobTaskState; 5 | use crate::transfer::messages::JobInfo; 6 | 7 | #[derive(clap::ValueEnum, Clone, Copy, Serialize, Deserialize, Debug, Eq, PartialEq, Hash)] 8 | pub enum Status { 9 | Waiting, 10 | Running, 11 | Finished, 12 | Failed, 13 | Canceled, 14 | Opened, 15 | } 16 | 17 | pub fn job_status(info: &JobInfo) -> Status { 18 | let has_waiting = info.counters.n_waiting_tasks(info.n_tasks) > 0; 19 | 20 | if info.counters.n_running_tasks > 0 { 21 | Status::Running 22 | } else if has_waiting { 23 | Status::Waiting 24 | } else if info.counters.n_failed_tasks > 0 { 25 | Status::Failed 26 | } else if info.counters.n_canceled_tasks > 0 { 27 | Status::Canceled 28 | } else { 29 | assert_eq!(info.counters.n_finished_tasks, info.n_tasks); 30 | if info.is_open { 31 | Status::Opened 32 | } else { 33 | Status::Finished 34 | } 35 | } 36 | } 37 | 38 | pub fn is_terminated(info: &JobInfo) -> bool { 39 | info.counters.n_running_tasks == 0 && info.counters.n_waiting_tasks(info.n_tasks) == 0 40 | } 41 | 42 | #[inline] 43 | pub fn get_task_status(status: &JobTaskState) -> Status { 44 | match status { 45 | JobTaskState::Waiting => Status::Waiting, 46 | JobTaskState::Running { .. } => Status::Running, 47 | JobTaskState::Finished { .. } => Status::Finished, 48 | JobTaskState::Failed { ..
} => Status::Failed, 49 | JobTaskState::Canceled { .. } => Status::Canceled, 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/utils.rs: -------------------------------------------------------------------------------- 1 | use clap::builder::TypedValueParser; 2 | use clap::{Command, Error}; 3 | use std::ffi::OsStr; 4 | 5 | #[macro_export] 6 | macro_rules! rpc_call { 7 | ($conn:expr, $message:expr, $matcher:pat $(=> $result:expr)?) => { 8 | async { 9 | match $conn.send_and_receive($message).await? { 10 | $matcher => $crate::Result::Ok(($($result),*)), 11 | $crate::transfer::messages::ToClientMessage::Error(e) => { 12 | $crate::common::error::error(format!("{}", e)) 13 | } 14 | msg => { 15 | $crate::common::error::error(format!("Received an invalid message {:?}", msg)) 16 | } 17 | } 18 | } 19 | }; 20 | } 21 | 22 | /// This argument checks that the input can be parsed as `Arg`. 23 | /// If it is, it will return the original input from the command line as a [`String`] along with the 24 | /// parsed value. 25 | #[derive(Debug, Clone)] 26 | pub struct PassThroughArgument<Arg>(String, Arg); 27 | 28 | impl<Arg> PassThroughArgument<Arg> { 29 | pub fn into_original_input(self) -> String { 30 | self.0 31 | } 32 | 33 | pub fn as_parsed_arg(&self) -> &Arg { 34 | &self.1 35 | } 36 | 37 | pub fn into_parsed_arg(self) -> Arg { 38 | self.1 39 | } 40 | } 41 | 42 | #[derive(Clone)] 43 | pub struct PassthroughParser<Arg>(fn(&str) -> anyhow::Result<Arg>); 44 | 45 | /// Creates a new parser that passes the original value through, while checking that `Arg` 46 | /// can be parsed successfully. 47 | pub fn passthrough_parser<Arg>(parser: fn(&str) -> anyhow::Result<Arg>) -> PassthroughParser<Arg> { 48 | PassthroughParser(parser) 49 | } 50 | 51 | impl<Arg: Clone + Send + Sync + 'static> TypedValueParser for PassthroughParser<Arg> { 52 | type Value = PassThroughArgument<Arg>; 53 | 54 | fn parse_ref( 55 | &self, 56 | cmd: &Command, 57 | arg: Option<&clap::Arg>, 58 | value: &OsStr, 59 | ) -> Result<Self::Value, Error> { 60 | self.0 61 | .parse_ref(cmd, arg, value) 62 | .map(|parsed| PassThroughArgument(value.to_string_lossy().to_string(), parsed)) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/env.rs: -------------------------------------------------------------------------------- 1 | use bstr::BString; 2 | 3 | const HQ_ENV_PREFIX: &str = "HQ_"; 4 | 5 | macro_rules!
create_hq_env { 6 | ($name: literal) => { 7 | concat!("HQ_", $name) 8 | }; 9 | } 10 | 11 | pub fn is_hq_env(name: &BString) -> bool { 12 | name.starts_with(HQ_ENV_PREFIX.as_bytes()) 13 | } 14 | 15 | /// Known environment variables 16 | pub const HQ_JOB_ID: &str = create_hq_env!("JOB_ID"); 17 | pub const HQ_TASK_ID: &str = create_hq_env!("TASK_ID"); 18 | pub const HQ_INSTANCE_ID: &str = create_hq_env!("INSTANCE_ID"); 19 | pub const HQ_SUBMIT_DIR: &str = create_hq_env!("SUBMIT_DIR"); 20 | pub const HQ_ENTRY: &str = create_hq_env!("ENTRY"); 21 | pub const HQ_PIN: &str = create_hq_env!("PIN"); 22 | pub const HQ_TASK_DIR: &str = create_hq_env!("TASK_DIR"); 23 | pub const HQ_ERROR_FILENAME: &str = create_hq_env!("ERROR_FILENAME"); 24 | pub const HQ_CPUS: &str = create_hq_env!("CPUS"); 25 | pub const HQ_NODE_FILE: &str = create_hq_env!("NODE_FILE"); 26 | pub const HQ_HOST_FILE: &str = create_hq_env!("HOST_FILE"); 27 | pub const HQ_NUM_NODES: &str = create_hq_env!("NUM_NODES"); 28 | pub const HQ_DATA_ACCESS: &str = create_hq_env!("DATA_ACCESS"); 29 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | use crate::common::error::HqError::GenericError; 4 | 5 | #[derive(Debug, Error)] 6 | pub enum HqError { 7 | #[error(transparent)] 8 | IoError(#[from] std::io::Error), 9 | #[error("Serialization error: {0}")] 10 | SerializationError(String), 11 | #[error("Deserialization error: {0}")] 12 | DeserializationError(String), 13 | #[error("Tako error: {0}")] 14 | TakoError(#[from] tako::Error), 15 | #[error("Version error: {0}")] 16 | VersionError(String), 17 | #[error("Error: {0}")] 18 | GenericError(String), 19 | } 20 | 21 | impl From<serde_json::error::Error> for HqError { 22 | fn from(e: serde_json::error::Error) -> Self { 23 | Self::SerializationError(e.to_string()) 24 | } 25 | } 26 | 27 | impl From<bincode::Error> for HqError { 28 | fn from(e: bincode::Error) -> Self { 29 | Self::SerializationError(e.to_string()) 30 | } 31 | } 32 | 33 | impl From<anyhow::Error> for HqError { 34 | fn from(error: anyhow::Error) -> Self { 35 | Self::GenericError(error.to_string()) 36 | } 37 | } 38 | 39 | impl From<toml::de::Error> for HqError { 40 | fn from(error: toml::de::Error) -> Self { 41 | Self::DeserializationError(error.to_string()) 42 | } 43 | } 44 | 45 | pub fn error<T>(message: String) -> crate::Result<T> { 46 | Err(GenericError(message)) 47 | } 48 | 49 | impl From<String> for HqError { 50 | fn from(e: String) -> Self { 51 | GenericError(e) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/idcounter.rs: -------------------------------------------------------------------------------- 1 | #[derive(Copy, Clone, Default, Debug)] 2 | pub struct IdCounter { 3 | counter: u32, 4 | } 5 | impl IdCounter { 6 | pub fn new(initial_value: u32) -> Self { 7 | Self { 8 | counter: initial_value, 9 | } 10 | } 11 | pub fn increment(&mut self) -> u32 { 12 | let value = self.counter; 13 | self.counter += 1; 14 | value 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/common.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | /// Format a duration as a PBS/Slurm time string, e.g.
01:05:02 4 | pub(super) fn format_duration(duration: &Duration) -> String { 5 | let mut seconds = duration.as_secs(); 6 | let hours = seconds / 3600; 7 | seconds %= 3600; 8 | let minutes = seconds / 60; 9 | seconds %= 60; 10 | format!("{hours:02}:{minutes:02}:{seconds:02}") 11 | } 12 | 13 | #[cfg(test)] 14 | mod test { 15 | use super::format_duration; 16 | use std::time::Duration; 17 | 18 | #[test] 19 | fn test_format_duration() { 20 | assert_eq!(format_duration(&Duration::from_secs(0)), "00:00:00"); 21 | assert_eq!(format_duration(&Duration::from_secs(1)), "00:00:01"); 22 | assert_eq!(format_duration(&Duration::from_secs(61)), "00:01:01"); 23 | assert_eq!(format_duration(&Duration::from_secs(3661)), "01:01:01"); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/info.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use std::fmt::{Display, Formatter}; 3 | use std::time::Duration; 4 | use tako::worker::WorkerConfiguration; 5 | 6 | pub const WORKER_EXTRA_MANAGER_KEY: &str = "JobManager"; 7 | 8 | #[derive(Clone, Serialize, Deserialize, Debug, PartialEq)] 9 | pub enum ManagerType { 10 | Pbs, 11 | Slurm, 12 | } 13 | 14 | impl Display for ManagerType { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 16 | match self { 17 | ManagerType::Pbs => f.write_str("PBS"), 18 | ManagerType::Slurm => f.write_str("SLURM"), 19 | } 20 | } 21 | } 22 | 23 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 24 | pub struct ManagerInfo { 25 | pub manager: ManagerType, 26 | pub allocation_id: String, 27 | /// Time that remains until the job ends 28 | pub time_limit: Option<Duration>, 29 | } 30 | 31 | impl ManagerInfo { 32 | pub fn new(manager: ManagerType, job_id: String, time_limit: Option<Duration>) -> Self { 33 | Self { 34 | manager, 35 | allocation_id: job_id, 36 | time_limit, 37 | } 38 | } 39 | } 40 | 41 | pub trait GetManagerInfo { 42 | fn get_manager_info(&self) -> Option<ManagerInfo>; 43 | } 44 | 45 | impl GetManagerInfo for WorkerConfiguration { 46 | fn get_manager_info(&self) -> Option<ManagerInfo> { 47 | self.extra 48 | .get(WORKER_EXTRA_MANAGER_KEY) 49 | .and_then(|info| serde_json::from_str(info).ok()) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/mod.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | pub mod info; 3 | pub mod pbs; 4 | pub mod slurm; 5 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod arraydef; 2 | pub mod arrayparser; 3 | pub mod cli; 4 | pub mod env; 5 | pub mod error; 6 | pub mod format; 7 | pub mod idcounter; 8 | pub mod manager; 9 | pub mod parser; 10 | pub mod parser2; 11 | pub mod placeholders; 12 | pub mod rpc; 13 | pub mod serialization; 14 | pub mod serverdir; 15 | pub mod setup; 16 | pub mod utils; 17 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/rpc.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{Debug, Formatter}; 2 | use tokio::sync::oneshot::Receiver; 3 | use tokio::sync::{mpsc, oneshot}; 4 | 5 | /// Can be used to respond to an RPC call.
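///
/// A rough usage sketch (the `tx` channel and the `Query` message type are hypothetical,
/// they are not defined in this module):
/// ```ignore
/// let (token, rx) = ResponseToken::<u32>::new();
/// tx.send(Query { token })?; // hand the token to the request handler
/// let answer = rx.await?;    // resolves once the handler calls `token.respond(...)`
/// ```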
6 | #[must_use = "response token should be used to respond to a request"] 7 | pub struct ResponseToken<T> { 8 | sender: oneshot::Sender<T>, 9 | } 10 | 11 | impl<T> Debug for ResponseToken<T> { 12 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 13 | f.write_str("Response token") 14 | } 15 | } 16 | 17 | impl<T> ResponseToken<T> { 18 | pub fn new() -> (ResponseToken<T>, Receiver<T>) { 19 | let (tx, rx) = oneshot::channel::<T>(); 20 | (Self { sender: tx }, rx) 21 | } 22 | 23 | pub fn respond(self, response: T) { 24 | if let Err(_e) = self.sender.send(response) { 25 | log::warn!("Could not send response to RPC method, the other end hung up"); 26 | } 27 | } 28 | } 29 | 30 | /// Helper function for creating request-response RPC calls. 31 | /// Expects a callback that will receive a response token. 32 | /// The returned receiver resolves once the response token has been used to respond. 33 | pub fn initiate_request<T, R, F>(make_request: F) -> oneshot::Receiver<R> 34 | where 35 | F: FnOnce(ResponseToken<R>) -> Result<(), mpsc::error::SendError<T>>, 36 | R: std::fmt::Debug, 37 | { 38 | let (token, rx) = ResponseToken::new(); 39 | if let Err(error) = make_request(token) { 40 | log::warn!("Could not make RPC request: {error:?}"); 41 | } 42 | rx 43 | } 44 | 45 | pub type RpcSender<T> = mpsc::UnboundedSender<T>; 46 | pub type RpcReceiver<T> = mpsc::UnboundedReceiver<T>; 47 | 48 | pub fn make_rpc_queue<T>() -> (RpcSender<T>, RpcReceiver<T>) { 49 | mpsc::unbounded_channel() 50 | } 51 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/serialization.rs: -------------------------------------------------------------------------------- 1 | use bincode::Options; 2 | use serde::de::DeserializeOwned; 3 | use serde::{Deserialize, Serialize}; 4 | use std::fmt::{Debug, Formatter}; 5 | use std::marker::PhantomData; 6 | 7 | /// Helper trait to configure serialization options via separate types. 8 | pub trait SerializationConfig { 9 | fn config() -> impl Options; 10 | } 11 | 12 | pub struct DefaultConfig; 13 | 14 | impl SerializationConfig for DefaultConfig { 15 | fn config() -> impl Options { 16 | bincode::DefaultOptions::new().with_limit(tako::MAX_FRAME_SIZE as u64) 17 | } 18 | } 19 | 20 | pub struct TrailingAllowedConfig; 21 | 22 | impl SerializationConfig for TrailingAllowedConfig { 23 | fn config() -> impl Options { 24 | bincode::DefaultOptions::new() 25 | .allow_trailing_bytes() 26 | .with_limit(tako::MAX_FRAME_SIZE as u64) 27 | } 28 | } 29 | 30 | /// Strongly typed wrapper over `T` serialized with Bincode.
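///
/// A small illustrative sketch (the `Message` type is hypothetical):
/// ```ignore
/// #[derive(serde::Serialize, serde::Deserialize)]
/// struct Message { id: u32 }
///
/// let wrapped: Serialized<Message, DefaultConfig> = Serialized::new(&Message { id: 1 })?;
/// let restored: Message = wrapped.deserialize()?;
/// ```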
31 | #[derive(Serialize, Deserialize)] 32 | pub struct Serialized<T, C> { 33 | #[serde(with = "serde_bytes")] 34 | data: Box<[u8]>, 35 | _phantom: PhantomData<(T, C)>, 36 | } 37 | 38 | impl<T, C> Debug for Serialized<T, C> { 39 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 40 | write!( 41 | f, 42 | "Serialized {} ({}) byte(s)", 43 | std::any::type_name::<T>(), 44 | self.data.len() 45 | ) 46 | } 47 | } 48 | 49 | impl<T, C> Clone for Serialized<T, C> { 50 | fn clone(&self) -> Self { 51 | Self { 52 | data: self.data.clone(), 53 | _phantom: PhantomData, 54 | } 55 | } 56 | } 57 | 58 | impl<T: Serialize + DeserializeOwned, C: SerializationConfig> Serialized<T, C> { 59 | pub fn new(value: &T) -> bincode::Result<Self> { 60 | let result = C::config().serialize(value)?; 61 | // Check that we're not reallocating needlessly in `into_boxed_slice` 62 | debug_assert_eq!(result.capacity(), result.len()); 63 | Ok(Self { 64 | data: result.into_boxed_slice(), 65 | _phantom: Default::default(), 66 | }) 67 | } 68 | 69 | pub fn deserialize(&self) -> bincode::Result<T> { 70 | C::config().deserialize(&self.data) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/setup.rs: -------------------------------------------------------------------------------- 1 | use env_logger::DEFAULT_FILTER_ENV; 2 | use env_logger::fmt::style::{AnsiColor, Color, Style}; 3 | use log::LevelFilter; 4 | use std::io::Write; 5 | 6 | /// Sets the behavior of the logger, based on passed environment variables 7 | /// such as `RUST_LOG`. 8 | pub fn setup_logging(verbose: bool) { 9 | let mut builder = env_logger::Builder::default(); 10 | builder.filter_level(if verbose { 11 | LevelFilter::Debug 12 | } else { 13 | LevelFilter::Info 14 | }); 15 | 16 | let has_debug = std::env::var(DEFAULT_FILTER_ENV) 17 | .map(|v| v.contains("debug")) 18 | .unwrap_or(false); 19 | 20 | if verbose || has_debug { 21 | builder.format_timestamp_millis(); 22 | } else { 23 | // Shortened format 24 | //