├── .dockerignore ├── .github ├── renovate.json5 └── workflows │ ├── build.yml │ ├── deploy_docs.yml │ ├── nightly.yml │ ├── publish_container.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── experiment-alternative-resources.py ├── experiment-dask.py ├── experiment-encryption-overhead.py ├── experiment-fractional-resources.py ├── experiment-io-streaming.py ├── experiment-numa.py ├── experiment-per-task-overhead.py ├── experiment-scalability-stress.py ├── experiment-scalability.py ├── experiment-server-cpu-util.py ├── experiment-total-overhead.py ├── main.py ├── postprocess.py ├── requirements.txt ├── src │ ├── __init__.py │ ├── analysis │ │ ├── chart.py │ │ └── dataframe.py │ ├── benchmark │ │ ├── __init__.py │ │ ├── database.py │ │ ├── identifier.py │ │ ├── result.py │ │ └── runner.py │ ├── benchmark_defs.py │ ├── build │ │ ├── hq.py │ │ └── repository.py │ ├── cli.py │ ├── clusterutils │ │ ├── __init__.py │ │ ├── cluster_helper.py │ │ ├── node_list.py │ │ └── profiler.py │ ├── environment │ │ ├── __init__.py │ │ ├── dask.py │ │ ├── hq.py │ │ ├── snake.py │ │ └── utils.py │ ├── executor │ │ ├── executor.py │ │ ├── executor_script.py │ │ ├── external_executor.py │ │ ├── local_executor.py │ │ └── serialization.py │ ├── monitoring │ │ ├── monitor_script.py │ │ └── record.py │ ├── postprocessing │ │ ├── common.py │ │ ├── monitor.py │ │ ├── overview.py │ │ ├── report.py │ │ ├── serve.py │ │ └── templates │ │ │ ├── benchmark.html │ │ │ ├── compare_table.html │ │ │ ├── main.html │ │ │ ├── summary.html │ │ │ └── workload.html │ ├── submit │ │ ├── execute_script.py │ │ ├── options.py │ │ ├── slurm.py │ │ ├── submit.py │ │ └── utils.py │ ├── trace │ │ └── export.py │ ├── utils │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── io.py │ │ ├── process.py │ │ └── timing.py │ └── workloads │ │ ├── __init__.py │ │ ├── empty.py │ │ ├── sleep.py │ │ ├── sleep_resources.py │ │ ├── stress.py │ │ ├── utils.py │ │ └── workload.py └── sw_upload.py ├── crates ├── hyperqueue │ ├── Cargo.toml │ ├── benches │ │ └── benchmark.rs │ └── src │ │ ├── bin │ │ └── hq.rs │ │ ├── client │ │ ├── autoalloc.rs │ │ ├── commands │ │ │ ├── autoalloc.rs │ │ │ ├── data.rs │ │ │ ├── doc.rs │ │ │ ├── job.rs │ │ │ ├── journal │ │ │ │ ├── mod.rs │ │ │ │ └── output.rs │ │ │ ├── mod.rs │ │ │ ├── outputlog.rs │ │ │ ├── server.rs │ │ │ ├── submit │ │ │ │ ├── command.rs │ │ │ │ ├── defs.rs │ │ │ │ ├── directives.rs │ │ │ │ ├── jobfile.rs │ │ │ │ └── mod.rs │ │ │ ├── wait.rs │ │ │ └── worker.rs │ │ ├── globalsettings.rs │ │ ├── job.rs │ │ ├── mod.rs │ │ ├── output │ │ │ ├── cli.rs │ │ │ ├── common.rs │ │ │ ├── json.rs │ │ │ ├── mod.rs │ │ │ ├── outputs.rs │ │ │ └── quiet.rs │ │ ├── resources.rs │ │ ├── server.rs │ │ ├── status.rs │ │ ├── task.rs │ │ └── utils.rs │ │ ├── common │ │ ├── arraydef.rs │ │ ├── arrayparser.rs │ │ ├── cli.rs │ │ ├── env.rs │ │ ├── error.rs │ │ ├── format.rs │ │ ├── idcounter.rs │ │ ├── manager │ │ │ ├── common.rs │ │ │ ├── info.rs │ │ │ ├── mod.rs │ │ │ ├── pbs.rs │ │ │ └── slurm.rs │ │ ├── mod.rs │ │ ├── parser.rs │ │ ├── parser2.rs │ │ ├── placeholders.rs │ │ ├── rpc.rs │ │ ├── serialization.rs │ │ ├── serverdir.rs │ │ ├── setup.rs │ │ └── utils │ │ │ ├── controlflow.rs │ │ │ ├── fs.rs │ │ │ ├── mod.rs │ │ │ ├── network.rs │ │ │ ├── str.rs │ │ │ └── time.rs │ │ ├── dashboard │ │ ├── data │ │ │ ├── data.rs │ │ │ ├── fetch.rs │ │ │ ├── mod.rs │ │ │ ├── time_based_vec.rs │ │ │ ├── time_interval.rs 
│ │ │ └── timelines │ │ │ │ ├── alloc_timeline.rs │ │ │ │ ├── job_timeline.rs │ │ │ │ ├── mod.rs │ │ │ │ └── worker_timeline.rs │ │ ├── mod.rs │ │ ├── ui │ │ │ ├── mod.rs │ │ │ ├── screen.rs │ │ │ ├── screens │ │ │ │ ├── autoalloc │ │ │ │ │ ├── alloc_timeline_chart.rs │ │ │ │ │ ├── allocations_info_table.rs │ │ │ │ │ ├── mod.rs │ │ │ │ │ ├── queue_info_table.rs │ │ │ │ │ └── queue_params_display.rs │ │ │ │ ├── cluster │ │ │ │ │ ├── mod.rs │ │ │ │ │ ├── overview │ │ │ │ │ │ ├── mod.rs │ │ │ │ │ │ ├── worker_count_chart.rs │ │ │ │ │ │ └── worker_table.rs │ │ │ │ │ └── worker │ │ │ │ │ │ ├── cpu_util_table.rs │ │ │ │ │ │ ├── mod.rs │ │ │ │ │ │ ├── worker_config_table.rs │ │ │ │ │ │ └── worker_utilization_chart.rs │ │ │ │ ├── jobs │ │ │ │ │ ├── job_info_display.rs │ │ │ │ │ ├── job_tasks_chart.rs │ │ │ │ │ ├── jobs_table.rs │ │ │ │ │ ├── mod.rs │ │ │ │ │ └── overview.rs │ │ │ │ ├── mod.rs │ │ │ │ └── root_screen.rs │ │ │ ├── styles.rs │ │ │ ├── terminal.rs │ │ │ └── widgets │ │ │ │ ├── chart.rs │ │ │ │ ├── filled_rectangle.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── progressbar.rs │ │ │ │ ├── table.rs │ │ │ │ ├── tasks_table.rs │ │ │ │ └── text.rs │ │ ├── ui_loop.rs │ │ └── utils.rs │ │ ├── lib.rs │ │ ├── server │ │ ├── autoalloc │ │ │ ├── config.rs │ │ │ ├── mod.rs │ │ │ ├── process.rs │ │ │ ├── queue │ │ │ │ ├── common.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── pbs.rs │ │ │ │ └── slurm.rs │ │ │ ├── service.rs │ │ │ └── state.rs │ │ ├── backend.rs │ │ ├── bootstrap.rs │ │ ├── client │ │ │ ├── autoalloc.rs │ │ │ ├── mod.rs │ │ │ └── submit.rs │ │ ├── event │ │ │ ├── journal │ │ │ │ ├── mod.rs │ │ │ │ ├── prune.rs │ │ │ │ ├── read.rs │ │ │ │ ├── stream.rs │ │ │ │ └── write.rs │ │ │ ├── mod.rs │ │ │ ├── payload.rs │ │ │ └── streamer.rs │ │ ├── job.rs │ │ ├── mod.rs │ │ ├── restore.rs │ │ ├── state.rs │ │ ├── tako_events.rs │ │ └── worker.rs │ │ ├── stream │ │ ├── mod.rs │ │ └── reader │ │ │ ├── mod.rs │ │ │ └── outputlog.rs │ │ ├── tests │ │ ├── mod.rs │ │ ├── server.rs │ │ └── utils.rs │ │ ├── transfer │ │ ├── auth.rs │ │ ├── connection.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ ├── protocol.rs │ │ └── stream.rs │ │ └── worker │ │ ├── bootstrap.rs │ │ ├── hwdetect.rs │ │ ├── mod.rs │ │ ├── parser.rs │ │ ├── start │ │ ├── mod.rs │ │ └── program.rs │ │ └── streamer.rs ├── pyhq │ ├── Cargo.toml │ ├── README.md │ ├── pyproject.toml │ ├── python │ │ └── hyperqueue │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── cluster │ │ │ └── __init__.py │ │ │ ├── common.py │ │ │ ├── ffi │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── cluster.py │ │ │ └── protocol.py │ │ │ ├── job.py │ │ │ ├── output.py │ │ │ ├── task │ │ │ ├── __init__.py │ │ │ ├── function │ │ │ │ ├── __init__.py │ │ │ │ └── wrapper.py │ │ │ ├── program.py │ │ │ └── task.py │ │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── package.py │ │ │ └── string.py │ │ │ ├── validation.py │ │ │ └── visualization.py │ └── src │ │ ├── client │ │ ├── job.rs │ │ ├── mod.rs │ │ └── server.rs │ │ ├── cluster │ │ ├── mod.rs │ │ ├── server.rs │ │ └── worker.rs │ │ ├── lib.rs │ │ ├── marshal.rs │ │ └── utils │ │ ├── error.rs │ │ └── mod.rs └── tako │ ├── Cargo.toml │ ├── README.md │ ├── benches │ ├── benchmark.rs │ ├── benchmarks │ │ ├── core.rs │ │ ├── mod.rs │ │ ├── scheduler.rs │ │ └── worker.rs │ └── utils │ │ └── mod.rs │ └── src │ ├── comm.rs │ ├── connection.rs │ ├── control.rs │ ├── events.rs │ ├── gateway.rs │ ├── hwstats.rs │ ├── internal │ ├── common │ │ ├── data_structures.rs │ │ ├── error.rs │ │ ├── ids.rs │ │ ├── index.rs │ │ ├── mod.rs │ │ ├── resources │ │ │ ├── 
allocation.rs │ │ │ ├── amount.rs │ │ │ ├── descriptor.rs │ │ │ ├── map.rs │ │ │ ├── mod.rs │ │ │ └── request.rs │ │ ├── rpc.rs │ │ ├── stablemap.rs │ │ ├── taskgroup.rs │ │ ├── trace.rs │ │ ├── utils.rs │ │ └── wrapped.rs │ ├── datasrv │ │ ├── dataobj.rs │ │ ├── datastorage.rs │ │ ├── download.rs │ │ ├── local_client.rs │ │ ├── messages.rs │ │ ├── mod.rs │ │ ├── test_utils.rs │ │ ├── tests.rs │ │ ├── upload.rs │ │ └── utils.rs │ ├── messages │ │ ├── auth.rs │ │ ├── common.rs │ │ ├── mod.rs │ │ └── worker.rs │ ├── mod.rs │ ├── scheduler │ │ ├── mod.rs │ │ ├── multinode.rs │ │ ├── query.rs │ │ └── state.rs │ ├── server │ │ ├── client.rs │ │ ├── comm.rs │ │ ├── core.rs │ │ ├── dataobj.rs │ │ ├── dataobjmap.rs │ │ ├── explain.rs │ │ ├── mod.rs │ │ ├── reactor.rs │ │ ├── rpc.rs │ │ ├── task.rs │ │ ├── taskmap.rs │ │ ├── worker.rs │ │ ├── workergroup.rs │ │ ├── workerload.rs │ │ └── workermap.rs │ ├── tests │ │ ├── integration │ │ │ ├── mod.rs │ │ │ ├── test_basic.rs │ │ │ ├── test_resources.rs │ │ │ ├── test_secret.rs │ │ │ ├── test_worker.rs │ │ │ └── utils │ │ │ │ ├── api.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── server.rs │ │ │ │ ├── task.rs │ │ │ │ └── worker.rs │ │ ├── mod.rs │ │ ├── test_query.rs │ │ ├── test_reactor.rs │ │ ├── test_scheduler_mn.rs │ │ ├── test_scheduler_sn.rs │ │ ├── test_worker.rs │ │ └── utils │ │ │ ├── env.rs │ │ │ ├── mod.rs │ │ │ ├── resources.rs │ │ │ ├── schedule.rs │ │ │ ├── shared.rs │ │ │ ├── task.rs │ │ │ ├── worker.rs │ │ │ └── workflows.rs │ ├── transfer │ │ ├── auth.rs │ │ ├── mod.rs │ │ └── transport.rs │ └── worker │ │ ├── comm.rs │ │ ├── configuration.rs │ │ ├── data │ │ ├── download.rs │ │ ├── localcomm.rs │ │ ├── mod.rs │ │ └── upload.rs │ │ ├── hwmonitor │ │ ├── amd.rs │ │ ├── mod.rs │ │ └── nvidia.rs │ │ ├── localcomm.rs │ │ ├── mod.rs │ │ ├── reactor.rs │ │ ├── resources │ │ ├── allocator.rs │ │ ├── concise.rs │ │ ├── map.rs │ │ ├── mod.rs │ │ └── pool.rs │ │ ├── rpc.rs │ │ ├── rqueue.rs │ │ ├── state.rs │ │ ├── task.rs │ │ ├── task_comm.rs │ │ └── test_util.rs │ ├── launcher.rs │ ├── lib.rs │ └── program.rs ├── docs ├── README.md ├── changelog.md ├── cheatsheet.md ├── cli-reference │ └── .gitkeep ├── cli │ ├── dashboard.md │ ├── output-mode.md │ └── shortcuts.md ├── deployment │ ├── allocation.md │ ├── cloud.md │ ├── index.md │ ├── server.md │ └── worker.md ├── faq.md ├── imgs │ ├── architecture-bg.png │ ├── architecture.png │ ├── architecture.svg │ ├── cheatsheet.png │ ├── cheatsheet.svg │ ├── dashboard.gif │ ├── hq-comparison-table.png │ ├── hq.png │ ├── schema.png │ ├── schema.svg │ ├── streaming.png │ └── streaming.svg ├── installation.md ├── jobs │ ├── arrays.md │ ├── cresources.md │ ├── directives.md │ ├── explain.md │ ├── failure.md │ ├── jobfile.md │ ├── jobs.md │ ├── multinode.md │ ├── openjobs.md │ ├── resources.md │ └── streaming.md ├── other-tools.md ├── overrides │ └── main.html ├── python │ ├── client.md │ ├── dependencies.md │ ├── index.md │ └── submit.md ├── quickstart.md ├── requirements.txt └── stylesheets │ └── extra.css ├── examples ├── README.md └── iterative-computation │ └── README.md ├── mkdocs.yml ├── nedoc.conf ├── pytest.ini ├── ruff.toml ├── scripts ├── bless_tests.sh ├── check.sh ├── check_package_versions.py ├── docs │ ├── build_cli_reference.py │ └── copy_examples.py ├── extract_changelog.py ├── get_docs_version.py └── print_vers.py └── tests ├── README.md ├── __init__.py ├── autoalloc ├── __init__.py ├── conftest.py ├── flavor.py ├── mock │ ├── __init__.py │ ├── manager.py │ ├── mock.py │ ├── pbs.py │ └── slurm.py ├── 
test_autoalloc.py ├── test_cli.py ├── test_dryrun.py ├── test_native.py └── utils.py ├── conftest.py ├── job ├── __init__.py ├── test_file_cleanup.py ├── test_job_cat.py └── test_job_forget.py ├── output ├── __init__.py ├── test_json.py └── test_quiet.py ├── pyapi ├── __init__.py ├── binding │ ├── __init__.py │ └── test_server.py ├── test_cluster.py ├── test_dependencies.py ├── test_function.py ├── test_job.py └── test_visualization.py ├── pytest.ini ├── requirements.txt ├── test_array.py ├── test_cpus.py ├── test_datalayer.py ├── test_directives.py ├── test_entries.py ├── test_events.py ├── test_explain.py ├── test_job.py ├── test_job_mn.py ├── test_jobfile.py ├── test_journal.py ├── test_manager.py ├── test_placeholders.py ├── test_resources.py ├── test_server.py ├── test_stream.py ├── test_task.py ├── test_task_cleanup.py ├── test_time.py ├── test_utils.py ├── test_worker.py └── utils ├── __init__.py ├── cmd.py ├── io.py ├── job.py ├── mock.py ├── table.py └── wait.py /.dockerignore: -------------------------------------------------------------------------------- 1 | target/ 2 | -------------------------------------------------------------------------------- /.github/renovate.json5: -------------------------------------------------------------------------------- 1 | { 2 | $schema: "https://docs.renovatebot.com/renovate-schema.json", 3 | extends: [ 4 | "config:recommended", 5 | // Enable the dependency dashboard issue 6 | ":dependencyDashboard", 7 | ], 8 | "schedule": [ 9 | "at 7:00am on monday" 10 | ], 11 | // Group Rust updates into a single PR 12 | "packageRules": [ 13 | { 14 | "matchManagers": [ 15 | "cargo" 16 | ], 17 | "matchUpdateTypes": [ 18 | "minor", 19 | "patch" 20 | ], 21 | "groupName": "Rust non-major dependencies", 22 | "groupSlug": "rust-minor-patch" 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /.github/workflows/deploy_docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v*' 9 | 10 | concurrency: docs 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | if: github.repository_owner == 'It4innovations' 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | # Download all git history to enable git revision history display in docs pages 20 | fetch-depth: 0 21 | - name: Install stable toolchain 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: 1.87.0 26 | override: true 27 | components: clippy, rustfmt 28 | - uses: Swatinem/rust-cache@v2 29 | - name: Set up Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: 3.9 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip wheel setuptools 36 | python -m pip install -r docs/requirements.txt 37 | - name: Install cli_doc 38 | run: cargo install --git https://github.com/spirali/cli_doc 39 | - name: Build HyperQueue 40 | run: cargo build 41 | - name: Build docs 42 | run: mkdocs build 43 | - name: Set Git CI config 44 | run: | 45 | git config user.name gh-ci-deploy-docs 46 | git config user.email gh-ci-deploy-docs@github.com 47 | - name: Calculate docs version 48 | run: | 49 | python3 scripts/get_docs_version.py > version.json 50 | cat version.json 51 | echo "VERSION=$(cat version.json)" >> $GITHUB_ENV 52 | - name: Deploy latest docs 53 | if: fromJson(env.VERSION).type == 'latest' 54 | run: mike deploy --push latest 55 | - name: Deploy stable 
docs 56 | if: fromJson(env.VERSION).type == 'stable' 57 | run: mike deploy --push -u ${{ fromJson(env.VERSION).version }} stable 58 | -------------------------------------------------------------------------------- /.github/workflows/nightly.yml: -------------------------------------------------------------------------------- 1 | name: Create nightly build release 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 23 * * *" 7 | jobs: 8 | set-env: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | version: ${{ env.HQ_VERSION }} 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v4 15 | - name: Set HQ nightly version 16 | run: | 17 | echo "HQ_VERSION=nightly-$(date +'%Y-%m-%d')-${{ github.sha }}" >> $GITHUB_ENV 18 | - name: Print HQ nightly version 19 | run: | 20 | echo "HQ version: ${{ env.HQ_VERSION }}" 21 | build-artifacts: 22 | needs: [ set-env ] 23 | uses: ./.github/workflows/build.yml 24 | if: github.repository_owner == 'It4innovations' 25 | with: 26 | version: ${{ needs.set-env.outputs.version }} 27 | create-tag: 28 | runs-on: ubuntu-latest 29 | needs: [ set-env, build-artifacts ] 30 | steps: 31 | - name: Checkout sources 32 | uses: actions/checkout@v4 33 | - name: Create tag 34 | uses: rickstaa/action-create-tag@v1 35 | with: 36 | tag: nightly 37 | force_push_tag: true 38 | message: Nightly build ${{ needs.set-env.outputs.version }} 39 | create-release: 40 | runs-on: ubuntu-latest 41 | needs: [ create-tag ] 42 | steps: 43 | - name: Checkout sources 44 | uses: actions/checkout@v4 45 | 46 | - name: Generate changelog 47 | run: python3 scripts/extract_changelog.py DEV > generated-changelog.md 48 | 49 | - name: Download artifacts 50 | uses: actions/download-artifact@v4 51 | 52 | - name: Prepare release name 53 | run: | 54 | echo "RELEASE_NAME=Nightly build $(date +'%Y-%m-%d')" >> $GITHUB_ENV 55 | 56 | - name: Create release 57 | uses: ncipollo/release-action@v1 58 | id: create-release 59 | with: 60 | bodyFile: generated-changelog.md 61 | token: ${{ secrets.GITHUB_TOKEN }} 62 | allowUpdates: true 63 | name: ${{ env.RELEASE_NAME }} 64 | prerelease: true 65 | tag: nightly 66 | commit: ${{ github.sha }} 67 | artifacts: archive-*/** 68 | removeArtifacts: true 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .idea 3 | __pycache__ 4 | *.so 5 | *snap 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "crates/hyperqueue", 4 | "crates/tako", 5 | "crates/pyhq" 6 | ] 7 | default-members = [ 8 | "crates/hyperqueue", 9 | "crates/tako" 10 | ] 11 | resolver = "2" 12 | 13 | [workspace.package] 14 | rust-version = "1.87.0" 15 | edition = "2024" 16 | authors = ["Ada Böhm ", "Jakub Beránek "] 17 | 18 | [workspace.dependencies] 19 | tokio = "1" 20 | log = "0.4" 21 | env_logger = { version = "0.11", features = ["color"] } 22 | clap = "4" 23 | criterion = { version = "0.5", features = ["html_reports"] } 24 | derive_builder = "0.20" 25 | serde = { version = "1", features = ["rc"] } 26 | serde_json = "1" 27 | serde_bytes = "0.11" 28 | bytes = "1" 29 | chrono = "0.4" 30 | orion = "0.17" 31 | smallvec = "1" 32 | bincode = "1" 33 | futures = "0.3" 34 | tokio-util = "0.7" 35 | hex = "0.4" 36 | rand = "0.9" 37 | gethostname = "1.0" 38 | thiserror = "2" 39 | tempfile = "3.12.0" 40 | 
tracing = "0.1" 41 | anyhow = "1" 42 | nix = { version = "0.29", features = ["process", "signal"] } 43 | bstr = { version = "1", features = ["serde"] } 44 | psutil = "3" 45 | thin-vec = { version = "0.2", features = ["serde"] } 46 | bitflags = { version = "2", features = ["serde"] } 47 | 48 | [workspace.lints.clippy] 49 | dbg_macro = "deny" 50 | 51 | [profile.release] 52 | panic = "abort" 53 | 54 | # Profile designed for the most optimized release build that is distributed 55 | # to users. 56 | [profile.dist] 57 | inherits = "release" 58 | lto = true 59 | codegen-units = 1 60 | debug = "line-tables-only" -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lukemathwalker/cargo-chef:latest-rust-1 AS chef 2 | 3 | ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse 4 | WORKDIR /app 5 | 6 | FROM chef as planner 7 | 8 | COPY . . 9 | RUN cargo chef prepare --recipe-path recipe.json 10 | 11 | FROM chef AS builder 12 | WORKDIR /build 13 | COPY --from=planner /app/recipe.json recipe.json 14 | 15 | # Build dependencies and cache them in a Docker layer 16 | RUN cargo chef cook --release --recipe-path recipe.json 17 | 18 | # Build HyperQueue itself 19 | COPY . . 20 | RUN cargo build --release 21 | 22 | FROM ubuntu:22.04 AS runtime 23 | 24 | WORKDIR / 25 | COPY --from=builder /build/target/release/hq hq 26 | 27 | ENTRYPOINT ["./hq"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-present, Ada Böhm, Jakub Beranek 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark suite 2 | This directory contains a framework for running various benchmarks. 3 | 4 | It has support for spawning a distributed cluster for various tools (HQ, SnakeMake, ...), along with 5 | node monitoring and profiling. Some features are only available for HyperQueue clusters. 6 | 7 | The results of benchmarks are stored into JSON files, which can be used to generate HTML dashboards. 8 | 9 | ## Quick start 10 | The benchmarks are meant to be launched from Python code. 
You can find some examples in `main.py`. 11 | To compare HyperQueue with zero-worker and with normal worker, you can run: 12 | ```bash 13 | $ python main.py compare-zw 14 | ``` 15 | The results will be stored into `benchmarks/zw`. 16 | 17 | ## Available profilers 18 | You can attach various profilers to the HyperQueue server or the workers. Use the `server_profilers` 19 | and/or `worker_profilers` attribute of `HqClusterInfo`. 20 | 21 | ### Flamegraph (`FlamegraphProfiler`) 22 | Uses `perf` for stack sampling, results are rendered as a flamegraph. 23 | 24 | ### Perf events (`PerfEventsProfiler`) 25 | Uses `perf stat` to gather various CPU performance events. 26 | 27 | ### Callgrind (`CallgrindProfiler`) 28 | Uses Callgrind to instrument the profiled binary. The results can be visualized e.g. using KCacheGrind. 29 | Note that using Callgrind can slow down the execution by orders of magnitude. 30 | 31 | ### Cachegrind (`CachegrindProfiler`) 32 | Uses Cachegrind to instrument the profiled binary. The results can be visualized e.g. using KCacheGrind. 33 | Note that using Cachegrind can slow down the execution by orders of magnitude. 34 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | typer==0.9.0 2 | gitpython==3.1.41 3 | git+https://code.it4i.cz/def/cluster.git@3e2bcf58c0224bd0e0889c8a0f95957e4d969ca6 4 | pandas==1.3.3 5 | tqdm==4.66.3 6 | pyserde==0.12.3 7 | psutil==5.8.0 8 | humanize==3.12.0 9 | git+https://github.com/it4innovations/snailwatch@4d590c55e6b1e404e0398e8005dd998f5bc50be9#subdirectory=client 10 | jinja2==3.1.6 11 | matplotlib==3.6.2 12 | distributed==2023.11.0 13 | dask==2023.11.0 14 | seaborn==0.13.0 15 | bokeh==2.4.3 16 | -------------------------------------------------------------------------------- /benchmarks/src/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | CURRENT_DIR = Path(__file__).absolute().parent 4 | ROOT_DIR = CURRENT_DIR.parent.parent 5 | -------------------------------------------------------------------------------- /benchmarks/src/analysis/chart.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def render_chart(path: Path): 7 | assert path.suffix == "" 8 | plt.savefig(f"{path}.png") 9 | plt.savefig(f"{path}.pdf") 10 | -------------------------------------------------------------------------------- /benchmarks/src/analysis/dataframe.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import List, Any, Callable, Tuple 3 | 4 | import pandas as pd 5 | 6 | from ..benchmark.database import Database, DatabaseRecord 7 | 8 | 9 | class DataFrameExtractor: 10 | def __init__(self, database: Database): 11 | self.database = database 12 | self.keys: List[str] = [] 13 | self.transforms: List[Tuple[str, Callable[[DatabaseRecord], Any]]] = [] 14 | 15 | def extract(self, *args: str) -> "DataFrameExtractor": 16 | self.keys.extend(args) 17 | return self 18 | 19 | def transform(self, key: str, transform: Callable[[DatabaseRecord], Any]) -> "DataFrameExtractor": 20 | self.transforms.append((key, transform)) 21 | return self 22 | 23 | def build(self) -> pd.DataFrame: 24 | records = defaultdict(list) 25 | 26 | keys = frozenset(self.keys) 27 | for 
record in self.database.records: 28 | for key in keys: 29 | records[key].append(getattr(record, key)) 30 | for key, transform in self.transforms: 31 | records[key].append(transform(record)) 32 | return pd.DataFrame(records) 33 | -------------------------------------------------------------------------------- /benchmarks/src/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It4innovations/hyperqueue/aecde5c53dd41843cc566d87e6fe8036ba26a8fa/benchmarks/src/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmarks/src/benchmark/result.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | 4 | @dataclasses.dataclass(frozen=True) 5 | class BenchmarkResult: 6 | pass 7 | 8 | 9 | @dataclasses.dataclass(frozen=True) 10 | class Timeout(BenchmarkResult): 11 | timeout: float 12 | 13 | def __repr__(self): 14 | return f"Timeout after {self.timeout}s" 15 | 16 | 17 | @dataclasses.dataclass(frozen=True) 18 | class Failure(BenchmarkResult): 19 | traceback: str 20 | 21 | def __repr__(self): 22 | return f"Failure: {self.traceback}" 23 | 24 | 25 | @dataclasses.dataclass(frozen=True) 26 | class Success(BenchmarkResult): 27 | duration: float 28 | 29 | def __repr__(self): 30 | return f"Success: {self.duration}s" 31 | -------------------------------------------------------------------------------- /benchmarks/src/build/repository.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | 4 | from git import Repo 5 | from git.repo.fun import rev_parse 6 | 7 | from .. import ROOT_DIR 8 | 9 | REPO = Repo(ROOT_DIR) 10 | 11 | # This tag represents the active git workspace 12 | TAG_WORKSPACE = "current" 13 | 14 | 15 | def resolve_tag(tag: str) -> str: 16 | if tag == TAG_WORKSPACE: 17 | return tag 18 | return rev_parse(REPO, tag).hexsha 19 | 20 | 21 | @contextlib.contextmanager 22 | def checkout_tag(tag: str): 23 | if tag == TAG_WORKSPACE: 24 | yield 25 | else: 26 | active_branch = REPO.active_branch 27 | 28 | logging.info("Stashing repository") 29 | msg = REPO.git.stash() 30 | try: 31 | aliases = REPO.git.name_rev(["--name-only", tag]).split() 32 | logging.info(f"Checking out {tag} ({', '.join(aliases)})") 33 | REPO.git.checkout(tag) 34 | yield 35 | finally: 36 | logging.info(f"Reverting to original state ({active_branch})") 37 | REPO.git.checkout(active_branch) 38 | if "No local changes to save" not in msg: 39 | REPO.git.stash("pop") 40 | -------------------------------------------------------------------------------- /benchmarks/src/clusterutils/__init__.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | from .node_list import NodeList 4 | 5 | 6 | @dataclasses.dataclass(frozen=True) 7 | class ClusterInfo: 8 | node_list: NodeList 9 | monitor_nodes: bool = False 10 | -------------------------------------------------------------------------------- /benchmarks/src/clusterutils/node_list.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import os 3 | import socket 4 | import subprocess 5 | from typing import List, Optional 6 | 7 | 8 | class NodeList(abc.ABC): 9 | def resolve(self) -> List[str]: 10 | raise NotImplementedError() 11 | 12 | def is_localhost(self) -> bool: 13 | return False 14 | 15 | 16 | class Local(NodeList): 17 | 
HOSTNAME = socket.gethostname() 18 | 19 | def resolve(self) -> List[str]: 20 | return [Local.HOSTNAME] 21 | 22 | def is_localhost(self) -> bool: 23 | return True 24 | 25 | 26 | class PBS(NodeList): 27 | def resolve(self) -> List[str]: 28 | return get_pbs_nodes() 29 | 30 | 31 | def is_inside_pbs() -> bool: 32 | return "PBS_NODEFILE" in os.environ 33 | 34 | 35 | def get_pbs_nodes() -> List[str]: 36 | assert is_inside_pbs() 37 | 38 | with open(os.environ["PBS_NODEFILE"]) as f: 39 | return [line.strip() for line in f] 40 | 41 | 42 | class Slurm(NodeList): 43 | def resolve(self) -> List[str]: 44 | return get_slurm_nodes() 45 | 46 | 47 | def get_slurm_nodes() -> List[str]: 48 | assert is_inside_slurm() 49 | output = subprocess.check_output(["scontrol", "show", "hostnames"]) 50 | return [node.strip() for node in output.decode().split("\n") if node.strip()] 51 | 52 | 53 | def is_inside_slurm() -> bool: 54 | return "SLURM_NODELIST" in os.environ 55 | 56 | 57 | class Explicit(NodeList): 58 | def __init__(self, nodes: List[str]): 59 | self.nodes = nodes 60 | 61 | def resolve(self) -> List[str]: 62 | return self.nodes 63 | 64 | 65 | def get_active_nodes() -> NodeList: 66 | if is_inside_pbs(): 67 | return PBS() 68 | elif is_inside_slurm(): 69 | return Slurm() 70 | else: 71 | return Local() 72 | 73 | 74 | def get_slurm_allocation_id() -> Optional[str]: 75 | return os.environ.get("SLURM_JOB_ID") 76 | -------------------------------------------------------------------------------- /benchmarks/src/environment/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict 3 | 4 | 5 | class Environment: 6 | def start(self): 7 | raise NotImplementedError 8 | 9 | def stop(self): 10 | raise NotImplementedError 11 | 12 | def __enter__(self): 13 | self.start() 14 | return self 15 | 16 | def __exit__(self, exc_type, exc_val, exc_tb): 17 | self.stop() 18 | 19 | 20 | class EnvironmentDescriptor: 21 | """ 22 | This class should describe an instance of an environment. 23 | The class has to be easily picklable and able to create new environments. 24 | It also has to be able to describe itself using metadata. 25 | """ 26 | 27 | def create_environment(self, workdir: Path) -> Environment: 28 | raise NotImplementedError 29 | 30 | def name(self) -> str: 31 | raise NotImplementedError 32 | 33 | def parameters(self) -> Dict[str, Any]: 34 | raise NotImplementedError 35 | 36 | def metadata(self) -> Dict[str, Any]: 37 | return {} 38 | -------------------------------------------------------------------------------- /benchmarks/src/environment/snake.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | from pathlib import Path 4 | from typing import Any, Dict 5 | 6 | from . 
import Environment, EnvironmentDescriptor 7 | from .utils import EnvStateManager 8 | 9 | 10 | @dataclasses.dataclass(frozen=True) 11 | class SnakeClusterInfo: 12 | workdir: Path 13 | 14 | 15 | class SnakeEnvironmentDescriptor(EnvironmentDescriptor): 16 | def create_environment(self, workdir: Path) -> Environment: 17 | info = SnakeClusterInfo(workdir) 18 | return SnakeEnvironment(info) 19 | 20 | def name(self) -> str: 21 | return "snake" 22 | 23 | def parameters(self) -> Dict[str, Any]: 24 | return {} 25 | 26 | def metadata(self) -> Dict[str, Any]: 27 | return {} 28 | 29 | 30 | class SnakeEnvironment(Environment, EnvStateManager): 31 | def __init__(self, info: SnakeClusterInfo): 32 | EnvStateManager.__init__(self) 33 | self.info = info 34 | self.snakefile = info.workdir / "Snakefile" 35 | 36 | @property 37 | def workdir(self) -> Path: 38 | return self.info.workdir 39 | 40 | def start(self): 41 | self.state_start() 42 | 43 | def stop(self): 44 | self.state_stop() 45 | 46 | def submit(self, cmds: str, cpus_per_task: int): 47 | logging.info(f"Starting Snakemake {cmds, cpus_per_task}") 48 | with open(self.snakefile, "w") as f: 49 | f.writelines(cmds) 50 | 51 | from snakemake import snakemake 52 | 53 | ret = snakemake( 54 | snakefile=str(self.snakefile), 55 | quiet=True, 56 | cores=cpus_per_task, 57 | workdir=str(self.workdir), 58 | ) 59 | if not ret: 60 | raise Exception( 61 | f"SnakeMake execution failed. You can find more details in {self.workdir / '.snakemake' / 'log'}" 62 | ) 63 | -------------------------------------------------------------------------------- /benchmarks/src/environment/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, List, Optional, Protocol 3 | 4 | 5 | class Init: 6 | pass 7 | 8 | 9 | class Started: 10 | pass 11 | 12 | 13 | class Stopped: 14 | pass 15 | 16 | 17 | class EnvStateManager: 18 | """ 19 | Helper mixin class that makes sure that an environment is used in the correct order and that 20 | it is not started/stopped multiple times. 21 | """ 22 | 23 | def __init__(self): 24 | self.state = Init() 25 | 26 | def state_start(self): 27 | assert isinstance(self.state, Init) 28 | self.state = Started() 29 | 30 | def state_stop(self): 31 | assert isinstance(self.state, Started) 32 | self.state = Stopped() 33 | 34 | 35 | def sanity_check_nodes(nodes: List[str]): 36 | for node in nodes: 37 | assert len(node) > 0 38 | assert len(set(nodes)) == len(nodes) 39 | assert len(nodes) > 0 40 | 41 | 42 | class WorkerConfig(Protocol): 43 | node: Optional[int] 44 | 45 | 46 | def assign_workers(workers: List[WorkerConfig], nodes: List[str]) -> Dict[str, List[WorkerConfig]]: 47 | round_robin_node = 0 48 | used_round_robin = set() 49 | 50 | node_assignments = defaultdict(list) 51 | for index, worker in enumerate(workers): 52 | node = worker.node 53 | if node is not None: 54 | if not (0 <= node < len(nodes)): 55 | raise Exception( 56 | f"Invalid node assignment. 
Worker {index} wants to be on node " 57 | f"{node}, but there are only {len(nodes)} worker nodes" 58 | ) 59 | else: 60 | node = round_robin_node 61 | round_robin_node = (round_robin_node + 1) % len(nodes) 62 | if node in used_round_robin: 63 | raise Exception(f"There are more workers ({len(workers)}) than worker nodes ({len(nodes)})") 64 | used_round_robin.add(node) 65 | if node >= len(nodes): 66 | raise Exception(f"Selected worker node is {node}, but there are only {len(nodes)} worker node(s)") 67 | node_assignments[nodes[node]].append(worker) 68 | return dict(node_assignments) 69 | -------------------------------------------------------------------------------- /benchmarks/src/executor/executor.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from pathlib import Path 3 | 4 | from ..benchmark.identifier import BenchmarkDescriptor 5 | from ..benchmark.result import BenchmarkResult 6 | 7 | 8 | @dataclasses.dataclass 9 | class BenchmarkContext: 10 | workdir: Path 11 | timeout_s: float 12 | 13 | def __post_init__(self): 14 | self.workdir = self.workdir.resolve() 15 | 16 | 17 | class BenchmarkExecutor: 18 | def execute(self, benchmark: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /benchmarks/src/executor/executor_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import sys 4 | 5 | from ..utils import activate_cwd 6 | from .local_executor import execute_benchmark 7 | from .serialization import SerializedBenchmark, serialize_result 8 | 9 | if __name__ == "__main__": 10 | pipe_path = sys.argv[1] 11 | 12 | data = sys.stdin.buffer.read() 13 | benchmark = pickle.loads(data) 14 | assert isinstance(benchmark, SerializedBenchmark) 15 | 16 | with activate_cwd(benchmark.cwd): 17 | result = execute_benchmark(benchmark.descriptor, benchmark.ctx) 18 | serialized_result = serialize_result(result) 19 | with open(pipe_path, "w") as file: 20 | print(json.dumps(serialized_result), file=file) 21 | -------------------------------------------------------------------------------- /benchmarks/src/executor/local_executor.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | from ..benchmark.identifier import BenchmarkDescriptor 4 | from ..benchmark.result import BenchmarkResult, Failure, Success, Timeout 5 | from ..utils.timing import TimeoutException, with_timeout 6 | from ..workloads.workload import WorkloadExecutionResult 7 | from .executor import BenchmarkContext, BenchmarkExecutor 8 | 9 | 10 | class LocalBenchmarkExecutor(BenchmarkExecutor): 11 | """Executes benchmarks in the current process""" 12 | 13 | def execute(self, benchmark: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 14 | return execute_benchmark(benchmark, ctx) 15 | 16 | 17 | def execute_benchmark(descriptor: BenchmarkDescriptor, ctx: BenchmarkContext) -> BenchmarkResult: 18 | env = descriptor.env_descriptor.create_environment(ctx.workdir) 19 | workload = descriptor.workload 20 | 21 | def run() -> WorkloadExecutionResult: 22 | return workload.execute(env) 23 | 24 | try: 25 | with env: 26 | result = with_timeout(run, timeout_s=ctx.timeout_s) 27 | return Success(duration=result.duration) 28 | except TimeoutException: 29 | return Timeout(ctx.timeout_s) 30 | except BaseException: 31 | return 
Failure(traceback.format_exc()) 32 | -------------------------------------------------------------------------------- /benchmarks/src/executor/serialization.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | from ..benchmark.identifier import BenchmarkDescriptor 6 | from ..benchmark.result import BenchmarkResult, Failure, Success, Timeout 7 | from .executor import BenchmarkContext 8 | 9 | 10 | @dataclasses.dataclass(frozen=True) 11 | class SerializedBenchmark: 12 | descriptor: BenchmarkDescriptor 13 | ctx: BenchmarkContext 14 | cwd: Path 15 | 16 | 17 | def serialize_result(result: BenchmarkResult) -> Dict[str, Any]: 18 | if isinstance(result, Success): 19 | type = "success" 20 | elif isinstance(result, Timeout): 21 | type = "timeout" 22 | elif isinstance(result, Failure): 23 | type = "failure" 24 | else: 25 | assert False 26 | return dict(type=type, data=result.to_dict()) 27 | 28 | 29 | def deserialize_result(data: Dict[str, Any]) -> BenchmarkResult: 30 | type = data["type"] 31 | data = data["data"] 32 | if type == "success": 33 | return Success.from_dict(data) 34 | elif type == "timeout": 35 | return Timeout.from_dict(data) 36 | elif type == "failure": 37 | return Failure.from_dict(data) 38 | else: 39 | assert False 40 | -------------------------------------------------------------------------------- /benchmarks/src/monitoring/monitor_script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import sys 5 | import time 6 | 7 | import click 8 | import psutil 9 | from cluster.io import measure_and_store 10 | from record import generate_record, MonitoringOptions 11 | 12 | 13 | @click.command() 14 | @click.argument("output") 15 | @click.option("--capture-interval", default=1) 16 | @click.option("--dump-interval", default=10) 17 | @click.option("--observe-pids", default="") 18 | def main(output: str, capture_interval: int, dump_interval: int, observe_pids: str): 19 | options = MonitoringOptions(observe_network=False) 20 | 21 | processes = [] 22 | process_map = {} 23 | for pid in observe_pids.split(","): 24 | if pid: 25 | try: 26 | processes.append(psutil.Process(int(pid))) 27 | logging.info(f"Observing PID {pid}") 28 | except BaseException as e: 29 | logging.error(e) 30 | 31 | def capture(timestamp): 32 | try: 33 | start = time.time() 34 | result = generate_record(timestamp, processes, process_map, options) 35 | duration = time.time() - start 36 | logging.info(f"Capturing data took {duration:.5f}s") 37 | return result 38 | except Exception as e: 39 | logging.error("Opening cluster exception: {}".format(e)) 40 | return None 41 | 42 | def finish(): 43 | logging.info(f"Copying trace from {tmp_output} to {output}") 44 | shutil.copyfile(tmp_output, output) 45 | sys.exit() 46 | 47 | tmp_output = f"/tmp/{os.path.basename(output)}-{int(time.time())}" 48 | 49 | # Create temporary file 50 | with open(tmp_output, "w") as _: 51 | pass 52 | 53 | measure_and_store(capture_interval, dump_interval, tmp_output, capture, finish) 54 | 55 | 56 | if __name__ == "__main__": 57 | logging.basicConfig( 58 | level=logging.INFO, 59 | format="%(levelname)s:%(asctime)s:%(funcName)s: %(message)s", 60 | datefmt="%Y-%m-%d %H:%M:%S", 61 | ) 62 | main() 63 | -------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/benchmark.html: 
--------------------------------------------------------------------------------
1 | Duration: {{ "%.4f"|format(benchmark.record.duration) }} s
2 | {% if benchmark.process_stats %}
[lines 3-21: a "Process utilization" table whose HTML markup was lost in extraction; header row: Hostname | Key | Avg. CPU | Max. RSS; body: {% for (k, v) in benchmark.process_stats.items() %} one row per entry with {{ k[0] }} | {{ k[1] }} | {{ "%.2f"|format(v.avg_cpu) }} % | {{ format_bytes(v.max_rss) }} {% endfor %}]
22 | {% endif %}
23 | {% if node_utilization %}
[lines 24-40: a "Node utilization" table whose HTML markup was lost; header row: Hostname | Avg. CPU | Avg. memory; body: {% for (hostname, data) in node_utilization.items() %} one row per entry with {{ hostname }} | {{ "%.2f"|format(data["cpu"]) }} % | {{ "%.2f"|format(data["memory"]) }} % {% endfor %}]
41 | {% endif %}
42 | {% if benchmark.monitoring_report %}
43 | [a "Cluster report" link (markup lost)]
44 | {% endif %}
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/compare_table.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction; the only recoverable content is the {{tables}} placeholder on line 35.]
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/summary.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction. The recoverable content: for each group in keys, a section titled {{ group|e }} that lists every {{ x|e }} in data[group]; then a "Grouped by benchmark:" section that renders, for each (name, group) pair in data["Grouped by benchmark:"], the benchmark {{ name|e }} followed by its {{ group }} table.]
-------------------------------------------------------------------------------- /benchmarks/src/postprocessing/templates/workload.html: --------------------------------------------------------------------------------
[The HTML markup of this template was lost in extraction. The recoverable content is a header that renders {{ key }} for every key in environments (lines 21-23), followed by the surviving second loop over environments below:]
25 | 26 | {% for key in environments %} 27 | 28 | {% endfor %} 29 | 30 | -------------------------------------------------------------------------------- /benchmarks/src/submit/options.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | from ..utils.io import from_json, to_json 7 | 8 | 9 | @dataclasses.dataclass(frozen=True) 10 | class PBSSubmitOptions: 11 | queue: str 12 | nodes: int 13 | walltime: datetime.timedelta 14 | project: Optional[str] = None 15 | name: Optional[str] = None 16 | init_script: Optional[Path] = None 17 | 18 | 19 | def serialize_submit_options(options: PBSSubmitOptions, path: Path): 20 | with open(path, "w") as f: 21 | to_json(options, f) 22 | 23 | 24 | def deserialize_submit_options(path: Path) -> PBSSubmitOptions: 25 | with open(path) as f: 26 | return from_json(PBSSubmitOptions, f) 27 | -------------------------------------------------------------------------------- /benchmarks/src/submit/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | 5 | def generate_job_dir(workdir: Path) -> Path: 6 | """Tries to find a directory in `workdir` which name is an integer and return a large integer 7 | padded. The returned name is padded by zeros.""" 8 | workdir.mkdir(parents=True, exist_ok=True) 9 | 10 | ids = [] 11 | for item in workdir.iterdir(): 12 | if item.is_dir(): 13 | try: 14 | ids.append(int(item.name)) 15 | except BaseException: 16 | pass 17 | max_id = max(ids or [0]) 18 | dir_name = f"{max_id + 1:03}" 19 | return (workdir / dir_name).absolute() 20 | 21 | 22 | def format_allocation_time(duration: datetime.timedelta) -> str: 23 | days, seconds = duration.days, duration.seconds 24 | hours = days * 24 + seconds // 3600 25 | minutes = (seconds % 3600) // 60 26 | seconds = seconds % 60 27 | 28 | return f"{hours:02}:{minutes:02}:{seconds:02}" 29 | -------------------------------------------------------------------------------- /benchmarks/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import shutil 4 | from pathlib import Path 5 | 6 | 7 | def get_pyenv_from_env() -> str: 8 | return os.environ.get("VIRTUAL_ENV") 9 | 10 | 11 | def ensure_directory(path: Path) -> Path: 12 | path.mkdir(parents=True, exist_ok=True) 13 | return path.absolute() 14 | 15 | 16 | def check_file_exists(path: Path): 17 | if not path.exists(): 18 | raise Exception(f"Path {path} does not exist") 19 | if not path.is_file(): 20 | raise Exception(f"Path {path} is not a file") 21 | 22 | 23 | def is_binary_available(binary: str) -> bool: 24 | return shutil.which(binary) is not None 25 | 26 | 27 | @contextlib.contextmanager 28 | def activate_cwd(directory: Path): 29 | cwd = os.getcwd() 30 | os.chdir(directory) 31 | 32 | try: 33 | yield 34 | finally: 35 | os.chdir(cwd) 36 | -------------------------------------------------------------------------------- /benchmarks/src/utils/io.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from typing import TypeVar 3 | 4 | Type = TypeVar("Type") 5 | 6 | 7 | def from_json(cls: type[Type], input: typing.Union[typing.TextIO, str]) -> Type: 8 | from serde import json 9 | 10 | if not isinstance(input, str): 11 | input = input.read() 12 | return json.from_json(cls, input) 13 | 14 | 15 | def 
to_json(object: typing.Any, file: typing.TextIO): 16 | from serde import json 17 | 18 | serialized = json.to_json(object) 19 | file.write(serialized) 20 | -------------------------------------------------------------------------------- /benchmarks/src/utils/process.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | from typing import Dict, List, Optional 4 | 5 | 6 | def execute_process( 7 | args: List[str], 8 | stdout: Path, 9 | stderr: Path, 10 | env: Optional[Dict[str, str]] = None, 11 | check=True, 12 | ) -> subprocess.CompletedProcess: 13 | with open(stdout, "wb") as stdout_file: 14 | with open(stderr, "wb") as stderr_file: 15 | env = env or {} 16 | result = subprocess.run( 17 | args, 18 | env=env, 19 | stdin=subprocess.DEVNULL, 20 | stdout=stdout_file, 21 | stderr=stderr_file, 22 | ) 23 | if check: 24 | if result.returncode != 0: 25 | with open(stdout) as stdout_file: 26 | with open(stderr) as stderr_file: 27 | raise Exception( 28 | f"""The process {args} has exited with error code {result.returncode} 29 | Stdout: {stdout_file.read()} 30 | Stderr: {stderr_file.read()} 31 | """.strip() 32 | ) 33 | return result 34 | -------------------------------------------------------------------------------- /benchmarks/src/utils/timing.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import contextlib 3 | import multiprocessing.context 4 | import time 5 | from multiprocessing.pool import ThreadPool 6 | from typing import Callable, TypeVar 7 | 8 | DEFAULT_TIMEOUT = 15 9 | 10 | 11 | class TimeoutException(BaseException): 12 | pass 13 | 14 | 15 | def wait_until(fn, sleep_s=0.5, timeout_s=DEFAULT_TIMEOUT): 16 | end = time.time() + timeout_s 17 | 18 | while time.time() < end: 19 | value = fn() 20 | if value is not None and value is not False: 21 | return value 22 | time.sleep(sleep_s) 23 | raise TimeoutException(f"Wait timeouted after {timeout_s} seconds") 24 | 25 | 26 | TIMEOUT_POOL = None 27 | T = TypeVar("T") 28 | 29 | 30 | def with_timeout(fn: Callable[..., T], timeout_s: float) -> T: 31 | global TIMEOUT_POOL 32 | 33 | if TIMEOUT_POOL is None: 34 | # it needs to be more than 1 to avoid deadlocks when with_timeout is nested 35 | TIMEOUT_POOL = ThreadPool(8) 36 | atexit.register(TIMEOUT_POOL.close) 37 | 38 | future = TIMEOUT_POOL.apply_async(fn) 39 | try: 40 | return future.get(timeout=timeout_s) 41 | except multiprocessing.context.TimeoutError: 42 | raise TimeoutException() 43 | 44 | 45 | class Timings: 46 | def __init__(self): 47 | self.timings = {} 48 | 49 | def add(self, name, duration): 50 | assert name not in self.timings 51 | self.timings[name] = duration 52 | 53 | def duration(self) -> float: 54 | return self.timings["duration"] 55 | 56 | def to_dict(self): 57 | return dict(self.timings) 58 | 59 | def __repr__(self): 60 | return repr(self.timings) 61 | 62 | @contextlib.contextmanager 63 | def time(self, name="duration"): 64 | start = time.time() 65 | yield 66 | duration = time.time() - start 67 | self.add(name, duration) 68 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/__init__.py: -------------------------------------------------------------------------------- 1 | from .sleep import SleepHQ 2 | from .stress import StressHQ 3 | from .workload import Workload 4 | 5 | __all__ = [Workload, SleepHQ, StressHQ] 6 | 
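
The `with_timeout` helper and the `Timings` context manager defined in `benchmarks/src/utils/timing.py` above are what the executors combine when measuring a workload run. A minimal usage sketch follows; the `slow_operation` function and the `src.utils.timing` import path are illustrative assumptions for this example, not code from the repository:

```python
import time

# Assumed import path; in this repository the module lives at
# benchmarks/src/utils/timing.py and is imported relative to the `src` package.
from src.utils.timing import Timings, TimeoutException, with_timeout


def slow_operation() -> float:
    """Hypothetical stand-in for a benchmarked operation."""
    time.sleep(0.2)
    return 42.0


timings = Timings()
# The context manager records elapsed wall-clock time under the default "duration" key.
with timings.time():
    try:
        # Runs the callable on a thread pool and raises TimeoutException
        # if it does not finish within timeout_s seconds.
        result = with_timeout(slow_operation, timeout_s=5.0)
    except TimeoutException:
        result = None

print(f"took {timings.duration():.3f}s, result={result}")
```

Keeping the `try`/`except` inside the `with` block ensures the duration is recorded even when the call times out, since `Timings.time()` only stores the measurement when its body finishes.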
-------------------------------------------------------------------------------- /benchmarks/src/workloads/empty.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | from .utils import measure_dask_tasks 4 | from .workload import Workload, WorkloadExecutionResult 5 | from ..environment.dask import DaskEnvironment 6 | 7 | 8 | def empty(): 9 | pass 10 | 11 | 12 | class EmptyDask(Workload): 13 | def __init__(self, task_count: int): 14 | self.task_count = task_count 15 | 16 | def name(self) -> str: 17 | return "empty" 18 | 19 | def parameters(self) -> Dict[str, Any]: 20 | return {"task_count": self.task_count} 21 | 22 | def execute(self, env: DaskEnvironment) -> WorkloadExecutionResult: 23 | from distributed import Client 24 | 25 | def run(client: Client): 26 | tasks = [client.submit(empty, pure=False) for _ in range(self.task_count)] 27 | client.gather(tasks) 28 | 29 | return measure_dask_tasks(env, run) 30 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/sleep_resources.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Any, Dict 3 | 4 | from ..environment.hq import HqEnvironment 5 | from .utils import measure_hq_tasks 6 | from .workload import Workload, WorkloadExecutionResult 7 | 8 | 9 | class SleepWithResources(Workload, ABC): 10 | def __init__(self, task_count: int, resources: Dict[str, Any], sleep_duration=0): 11 | self.task_count = task_count 12 | self.resources = resources 13 | self.sleep_duration = sleep_duration 14 | 15 | def parameters(self) -> Dict[str, Any]: 16 | return dict( 17 | task_count=self.task_count, 18 | resources=self.resources, 19 | duration=self.sleep_duration, 20 | ) 21 | 22 | def name(self) -> str: 23 | return "sleep-with-resources" 24 | 25 | 26 | class SleepWithResourcesHQ(SleepWithResources): 27 | def execute(self, env: HqEnvironment) -> WorkloadExecutionResult: 28 | return measure_hq_tasks( 29 | env, 30 | ["sleep", str(self.sleep_duration)], 31 | task_count=self.task_count, 32 | resources=self.resources, 33 | ) 34 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/stress.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from abc import ABC 3 | from typing import Any, Dict, Optional 4 | 5 | from ..environment import Environment 6 | from ..utils import is_binary_available 7 | from .utils import measure_hq_tasks 8 | from .workload import Workload, WorkloadExecutionResult 9 | 10 | 11 | class Stress(Workload, ABC): 12 | def __init__( 13 | self, 14 | task_count: int, 15 | cpu_count: Optional[int] = None, 16 | stress_duration=1, 17 | ): 18 | self.task_count = task_count 19 | self.cpu_count = cpu_count 20 | self.stress_duration = stress_duration 21 | 22 | def name(self) -> str: 23 | return "stress" 24 | 25 | def parameters(self) -> Dict[str, Any]: 26 | return dict( 27 | task_count=self.task_count, 28 | cpu_count=self.cpu_count, 29 | duration=self.stress_duration, 30 | ) 31 | 32 | def execute(self, env: Environment) -> WorkloadExecutionResult: 33 | assert is_binary_available("stress") 34 | 35 | cpu_count = self.cpu_count or multiprocessing.cpu_count() 36 | return self.compute( 37 | env, 38 | task_count=self.task_count, 39 | cpu_count=cpu_count, 40 | stress_duration=self.stress_duration, 41 | ) 42 | 43 | def compute( 44 | self, env: 
Environment, task_count: int, cpu_count: int, stress_duration: int 45 | ) -> WorkloadExecutionResult: 46 | raise NotImplementedError 47 | 48 | 49 | class StressHQ(Stress): 50 | def compute( 51 | self, env: Environment, task_count: int, cpu_count: int, stress_duration: int 52 | ) -> WorkloadExecutionResult: 53 | return measure_hq_tasks( 54 | env, 55 | ["stress", "--cpu", str(cpu_count), "--timeout", str(stress_duration)], 56 | task_count=task_count, 57 | cpus_per_task=cpu_count, 58 | ) 59 | -------------------------------------------------------------------------------- /benchmarks/src/workloads/workload.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Any, Dict 3 | 4 | from ..environment import Environment 5 | 6 | 7 | @dataclasses.dataclass 8 | class WorkloadExecutionResult: 9 | duration: float 10 | 11 | 12 | class Workload: 13 | def name(self) -> str: 14 | raise NotImplementedError 15 | 16 | def parameters(self) -> Dict[str, Any]: 17 | raise NotImplementedError 18 | 19 | def execute(self, env: Environment) -> WorkloadExecutionResult: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /benchmarks/sw_upload.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Any, Dict 4 | 5 | import typer 6 | from src.utils import load_database 7 | from swclient.client import Client, Measurement 8 | 9 | app = typer.Typer() 10 | 11 | 12 | def unpack_dict(dictionary: Dict[str, Any]) -> Dict[str, str]: 13 | result = {} 14 | for key, value in dictionary.items(): 15 | if isinstance(value, dict): 16 | value = unpack_dict(value) 17 | for k, v in value.items(): 18 | result[f"{key}/{k}"] = v 19 | else: 20 | result[key] = value 21 | return result 22 | 23 | 24 | def prefix_dict(dictionary: Dict[str, Any], prefix: str) -> Dict[str, Any]: 25 | return {f"{prefix}/{k}": str(v) for (k, v) in dictionary.items()} 26 | 27 | 28 | def normalize_dict(dictionary: Dict[str, Any], prefix: str) -> Dict[str, Any]: 29 | return prefix_dict(unpack_dict(dictionary), prefix=prefix) 30 | 31 | 32 | @app.command() 33 | def upload( 34 | database_path: Path = typer.Argument(..., exists=True), 35 | token: str = typer.Option(...), 36 | ): 37 | client = Client("https://snailwatch.it4i.cz/api", token) 38 | database = load_database(database_path) 39 | measurements = [] 40 | 41 | for record in database.records: 42 | timestamp = datetime.fromtimestamp(record.timestamp) 43 | measurement = Measurement( 44 | benchmark=record.workload, 45 | environment=dict( 46 | **normalize_dict(record.workload_params, "workload"), 47 | **normalize_dict(record.environment_params, "env"), 48 | env=record.environment_params, 49 | **normalize_dict(record.benchmark_metadata, "metadata"), 50 | ), 51 | result=dict(duration=dict(type="time", value=record.duration)), 52 | timestamp=timestamp, 53 | ) 54 | measurements.append(measurement) 55 | 56 | client.upload_measurements(measurements) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() 61 | -------------------------------------------------------------------------------- /crates/hyperqueue/benches/benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::measurement::WallTime; 2 | use criterion::{BenchmarkGroup, Criterion, criterion_group, criterion_main}; 3 | use 
hyperqueue::common::placeholders::{has_placeholders, parse_resolvable_string}; 4 | 5 | fn bench_parse_placeholder(c: &mut BenchmarkGroup<WallTime>) { 6 | c.bench_function("no placeholders", |bencher| { 7 | bencher.iter(|| { 8 | parse_resolvable_string("/tmp/my-very-long-path/that-is-even-longer-than-we-thought") 9 | }); 10 | }); 11 | c.bench_function("single placeholder", |bencher| { 12 | bencher.iter(|| { 13 | parse_resolvable_string( 14 | "/tmp/my-very-long-path/%{TASK_ID}/that-is-even-longer-than-we-thought", 15 | ) 16 | }); 17 | }); 18 | c.bench_function("has_placeholders without placeholder", |bencher| { 19 | bencher.iter(|| { 20 | has_placeholders("/tmp/my-very-long-path/that-is-even-longer-than-we-thought") 21 | }); 22 | }); 23 | c.bench_function("has_placeholders with placeholder", |bencher| { 24 | bencher.iter(|| { 25 | has_placeholders( 26 | "/tmp/my-very-long-path/that-is-even-longer-than-we-thought/%{TASK_ID}", 27 | ) 28 | }); 29 | }); 30 | } 31 | 32 | pub fn benchmark_placeholders(c: &mut Criterion) { 33 | let mut group = c.benchmark_group("placeholder"); 34 | bench_parse_placeholder(&mut group); 35 | } 36 | 37 | criterion_group!(placeholders, benchmark_placeholders); 38 | 39 | criterion_main!(placeholders); 40 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/autoalloc.rs: -------------------------------------------------------------------------------- 1 | use crate::server::autoalloc::AllocationState; 2 | 3 | impl AllocationState { 4 | pub fn is_failed(&self) -> bool { 5 | match self { 6 | AllocationState::Finished { 7 | disconnected_workers, 8 | .. 9 | } => disconnected_workers.all_crashed(), 10 | AllocationState::FinishedUnexpectedly { failed, .. } => *failed, 11 | AllocationState::Queued { .. } | AllocationState::Running { .. } => false, 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/commands/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod autoalloc; 2 | pub mod data; 3 | pub mod doc; 4 | pub mod job; 5 | pub mod journal; 6 | pub mod outputlog; 7 | pub mod server; 8 | pub mod submit; 9 | pub mod wait; 10 | pub mod worker; 11 | 12 | /// Helper macro for generating CLI help for a `Duration` (or `Option<Duration>`) value 13 | /// that can be specified either using the HMS or humantime formats. 14 | macro_rules! duration_doc { 15 | ($text:expr) => { 16 | concat!( 17 | $text, 18 | "\n\n", 19 | r#"You can use either the `HH:MM:SS` format or a "humantime" format.
20 | For example: 21 | - 01:00:00 => 1 hour 22 | - 02:05:10 => 2 hours, 5 minutes, 10 seconds 23 | - 1h => 1 hour 24 | - 2h5m10s => 2 hours, 5 minutes, 10 seconds 25 | - 3h 10m 5s => 3 hours, 10 minutes, 5 seconds 26 | - 2 hours 5 minutes => 2 hours, 5 minutes"# 27 | ) 28 | }; 29 | } 30 | 31 | pub(crate) use duration_doc; 32 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/commands/submit/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod defs; 3 | pub mod directives; 4 | mod jobfile; 5 | 6 | pub use command::SubmitJobTaskConfOpts; 7 | pub use command::{JobSubmitOpts, submit_computation}; 8 | 9 | pub use jobfile::{JobSubmitFileOpts, submit_computation_from_job_file}; 10 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/globalsettings.rs: -------------------------------------------------------------------------------- 1 | use crate::client::output::outputs::Output; 2 | use std::path::{Path, PathBuf}; 3 | 4 | pub struct GlobalSettings { 5 | server_dir: PathBuf, 6 | printer: Box<dyn Output>, 7 | } 8 | 9 | impl GlobalSettings { 10 | pub fn new(server_dir: PathBuf, printer: Box<dyn Output>) -> Self { 11 | GlobalSettings { 12 | server_dir, 13 | printer, 14 | } 15 | } 16 | 17 | pub fn server_directory(&self) -> &Path { 18 | &self.server_dir 19 | } 20 | 21 | pub fn printer(&self) -> &dyn Output { 22 | self.printer.as_ref() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/job.rs: -------------------------------------------------------------------------------- 1 | use crate::rpc_call; 2 | use crate::transfer::connection::ClientSession; 3 | use crate::transfer::messages::{FromClientMessage, ToClientMessage}; 4 | use tako::{Map, WorkerId}; 5 | 6 | /// Maps worker IDs to hostnames.
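///
/// A minimal usage sketch (illustrative only; `session` and `worker_id` are assumed to
/// exist in the caller and are not defined in this module):
/// ```ignore
/// let workers: WorkerMap = get_worker_map(&mut session).await?;
/// if let Some(hostname) = workers.get(&worker_id) {
///     println!("task ran on worker host {hostname}");
/// }
/// ```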
7 | pub type WorkerMap = Map<WorkerId, String>; 8 | 9 | pub async fn get_worker_map(session: &mut ClientSession) -> anyhow::Result<WorkerMap> { 10 | let message = FromClientMessage::WorkerList; 11 | let response = 12 | rpc_call!(session.connection(), message, ToClientMessage::WorkerListResponse(r) => r) 13 | .await?; 14 | let map = response 15 | .workers 16 | .into_iter() 17 | .map(|w| (w.id, w.configuration.hostname)) 18 | .collect(); 19 | Ok(map) 20 | } 21 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/mod.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | pub mod autoalloc; 4 | pub mod commands; 5 | pub mod globalsettings; 6 | pub mod job; 7 | pub mod output; 8 | pub mod resources; 9 | pub mod server; 10 | pub mod status; 11 | pub mod task; 12 | pub mod utils; 13 | 14 | pub fn default_server_directory_path() -> PathBuf { 15 | let mut home = dirs::home_dir().unwrap_or_else(std::env::temp_dir); 16 | home.push(".hq-server"); 17 | home 18 | } 19 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/output/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod cli; 2 | mod common; 3 | pub use common::{Verbosity, VerbosityFlag, resolve_task_paths}; 4 | pub mod json; 5 | pub mod outputs; 6 | pub mod quiet; 7 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/server.rs: -------------------------------------------------------------------------------- 1 | use crate::transfer::connection::ClientConnection; 2 | use crate::transfer::messages::FromClientMessage; 3 | 4 | pub async fn client_stop_server(connection: &mut ClientConnection) -> crate::Result<()> { 5 | connection.send(FromClientMessage::Stop).await?; 6 | log::info!("Stopping server"); 7 | Ok(()) 8 | } 9 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/status.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde::Serialize; 3 | 4 | use crate::server::job::JobTaskState; 5 | use crate::transfer::messages::JobInfo; 6 | 7 | #[derive(clap::ValueEnum, Clone, Copy, Serialize, Deserialize, Debug, Eq, PartialEq, Hash)] 8 | pub enum Status { 9 | Waiting, 10 | Running, 11 | Finished, 12 | Failed, 13 | Canceled, 14 | Opened, 15 | } 16 | 17 | pub fn job_status(info: &JobInfo) -> Status { 18 | let has_waiting = info.counters.n_waiting_tasks(info.n_tasks) > 0; 19 | 20 | if info.counters.n_running_tasks > 0 { 21 | Status::Running 22 | } else if has_waiting { 23 | Status::Waiting 24 | } else if info.counters.n_failed_tasks > 0 { 25 | Status::Failed 26 | } else if info.counters.n_canceled_tasks > 0 { 27 | Status::Canceled 28 | } else { 29 | assert_eq!(info.counters.n_finished_tasks, info.n_tasks); 30 | if info.is_open { 31 | Status::Opened 32 | } else { 33 | Status::Finished 34 | } 35 | } 36 | } 37 | 38 | pub fn is_terminated(info: &JobInfo) -> bool { 39 | info.counters.n_running_tasks == 0 && info.counters.n_waiting_tasks(info.n_tasks) == 0 40 | } 41 | 42 | #[inline] 43 | pub fn get_task_status(status: &JobTaskState) -> Status { 44 | match status { 45 | JobTaskState::Waiting => Status::Waiting, 46 | JobTaskState::Running { .. } => Status::Running, 47 | JobTaskState::Finished { .. } => Status::Finished, 48 | JobTaskState::Failed { ..
} => Status::Failed, 49 | JobTaskState::Canceled { .. } => Status::Canceled, 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/client/utils.rs: -------------------------------------------------------------------------------- 1 | use clap::builder::TypedValueParser; 2 | use clap::{Command, Error}; 3 | use std::ffi::OsStr; 4 | 5 | #[macro_export] 6 | macro_rules! rpc_call { 7 | ($conn:expr, $message:expr, $matcher:pat $(=> $result:expr)?) => { 8 | async { 9 | match $conn.send_and_receive($message).await? { 10 | $matcher => $crate::Result::Ok(($($result),*)), 11 | $crate::transfer::messages::ToClientMessage::Error(e) => { 12 | $crate::common::error::error(format!("{}", e)) 13 | } 14 | msg => { 15 | $crate::common::error::error(format!("Received an invalid message {:?}", msg)) 16 | } 17 | } 18 | } 19 | }; 20 | } 21 | 22 | /// This argument checks that the input can be parsed as `Arg`. 23 | /// If it is, it will return the original input from the command line as a [`String`] along with the 24 | /// parsed value. 25 | #[derive(Debug, Clone)] 26 | pub struct PassThroughArgument<Arg>(String, Arg); 27 | 28 | impl<Arg> PassThroughArgument<Arg> { 29 | pub fn into_original_input(self) -> String { 30 | self.0 31 | } 32 | 33 | pub fn as_parsed_arg(&self) -> &Arg { 34 | &self.1 35 | } 36 | 37 | pub fn into_parsed_arg(self) -> Arg { 38 | self.1 39 | } 40 | } 41 | 42 | #[derive(Clone)] 43 | pub struct PassthroughParser<Arg>(fn(&str) -> anyhow::Result<Arg>); 44 | 45 | /// Creates a new parser that passes the original value through, while checking that `Arg` 46 | /// can be parsed successfully. 47 | pub fn passthrough_parser<Arg>(parser: fn(&str) -> anyhow::Result<Arg>) -> PassthroughParser<Arg> { 48 | PassthroughParser(parser) 49 | } 50 | 51 | impl<Arg: Clone + Send + Sync + 'static> TypedValueParser for PassthroughParser<Arg> { 52 | type Value = PassThroughArgument<Arg>; 53 | 54 | fn parse_ref( 55 | &self, 56 | cmd: &Command, 57 | arg: Option<&clap::Arg>, 58 | value: &OsStr, 59 | ) -> Result<Self::Value, Error> { 60 | self.0 61 | .parse_ref(cmd, arg, value) 62 | .map(|parsed| PassThroughArgument(value.to_string_lossy().to_string(), parsed)) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/env.rs: -------------------------------------------------------------------------------- 1 | use bstr::BString; 2 | 3 | const HQ_ENV_PREFIX: &str = "HQ_"; 4 | 5 | macro_rules!
create_hq_env { 6 | ($name: literal) => { 7 | concat!("HQ_", $name) 8 | }; 9 | } 10 | 11 | pub fn is_hq_env(name: &BString) -> bool { 12 | name.starts_with(HQ_ENV_PREFIX.as_bytes()) 13 | } 14 | 15 | /// Known environment variables 16 | pub const HQ_JOB_ID: &str = create_hq_env!("JOB_ID"); 17 | pub const HQ_TASK_ID: &str = create_hq_env!("TASK_ID"); 18 | pub const HQ_INSTANCE_ID: &str = create_hq_env!("INSTANCE_ID"); 19 | pub const HQ_SUBMIT_DIR: &str = create_hq_env!("SUBMIT_DIR"); 20 | pub const HQ_ENTRY: &str = create_hq_env!("ENTRY"); 21 | pub const HQ_PIN: &str = create_hq_env!("PIN"); 22 | pub const HQ_TASK_DIR: &str = create_hq_env!("TASK_DIR"); 23 | pub const HQ_ERROR_FILENAME: &str = create_hq_env!("ERROR_FILENAME"); 24 | pub const HQ_CPUS: &str = create_hq_env!("CPUS"); 25 | pub const HQ_NODE_FILE: &str = create_hq_env!("NODE_FILE"); 26 | pub const HQ_HOST_FILE: &str = create_hq_env!("HOST_FILE"); 27 | pub const HQ_NUM_NODES: &str = create_hq_env!("NUM_NODES"); 28 | pub const HQ_DATA_ACCESS: &str = create_hq_env!("DATA_ACCESS"); 29 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | use crate::common::error::HqError::GenericError; 4 | 5 | #[derive(Debug, Error)] 6 | pub enum HqError { 7 | #[error(transparent)] 8 | IoError(#[from] std::io::Error), 9 | #[error("Serialization error: {0}")] 10 | SerializationError(String), 11 | #[error("Deserialization error: {0}")] 12 | DeserializationError(String), 13 | #[error("Tako error: {0}")] 14 | TakoError(#[from] tako::Error), 15 | #[error("Version error: {0}")] 16 | VersionError(String), 17 | #[error("Error: {0}")] 18 | GenericError(String), 19 | } 20 | 21 | impl From<serde_json::error::Error> for HqError { 22 | fn from(e: serde_json::error::Error) -> Self { 23 | Self::SerializationError(e.to_string()) 24 | } 25 | } 26 | 27 | impl From<bincode::Error> for HqError { 28 | fn from(e: bincode::Error) -> Self { 29 | Self::SerializationError(e.to_string()) 30 | } 31 | } 32 | 33 | impl From<anyhow::Error> for HqError { 34 | fn from(error: anyhow::Error) -> Self { 35 | Self::GenericError(error.to_string()) 36 | } 37 | } 38 | 39 | impl From<toml::de::Error> for HqError { 40 | fn from(error: toml::de::Error) -> Self { 41 | Self::DeserializationError(error.to_string()) 42 | } 43 | } 44 | 45 | pub fn error<T>(message: String) -> crate::Result<T> { 46 | Err(GenericError(message)) 47 | } 48 | 49 | impl From<String> for HqError { 50 | fn from(e: String) -> Self { 51 | GenericError(e) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/idcounter.rs: -------------------------------------------------------------------------------- 1 | #[derive(Copy, Clone, Default, Debug)] 2 | pub struct IdCounter { 3 | counter: u32, 4 | } 5 | impl IdCounter { 6 | pub fn new(initial_value: u32) -> Self { 7 | Self { 8 | counter: initial_value, 9 | } 10 | } 11 | pub fn increment(&mut self) -> u32 { 12 | let value = self.counter; 13 | self.counter += 1; 14 | value 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/common.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | /// Format a duration as a PBS/Slurm time string, e.g.
01:05:02 4 | pub(super) fn format_duration(duration: &Duration) -> String { 5 | let mut seconds = duration.as_secs(); 6 | let hours = seconds / 3600; 7 | seconds %= 3600; 8 | let minutes = seconds / 60; 9 | seconds %= 60; 10 | format!("{hours:02}:{minutes:02}:{seconds:02}") 11 | } 12 | 13 | #[cfg(test)] 14 | mod test { 15 | use super::format_duration; 16 | use std::time::Duration; 17 | 18 | #[test] 19 | fn test_format_duration() { 20 | assert_eq!(format_duration(&Duration::from_secs(0)), "00:00:00"); 21 | assert_eq!(format_duration(&Duration::from_secs(1)), "00:00:01"); 22 | assert_eq!(format_duration(&Duration::from_secs(61)), "00:01:01"); 23 | assert_eq!(format_duration(&Duration::from_secs(3661)), "01:01:01"); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/info.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use std::fmt::{Display, Formatter}; 3 | use std::time::Duration; 4 | use tako::worker::WorkerConfiguration; 5 | 6 | pub const WORKER_EXTRA_MANAGER_KEY: &str = "JobManager"; 7 | 8 | #[derive(Clone, Serialize, Deserialize, Debug, PartialEq)] 9 | pub enum ManagerType { 10 | Pbs, 11 | Slurm, 12 | } 13 | 14 | impl Display for ManagerType { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 16 | match self { 17 | ManagerType::Pbs => f.write_str("PBS"), 18 | ManagerType::Slurm => f.write_str("SLURM"), 19 | } 20 | } 21 | } 22 | 23 | #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] 24 | pub struct ManagerInfo { 25 | pub manager: ManagerType, 26 | pub allocation_id: String, 27 | /// Time that remains until the job ends 28 | pub time_limit: Option<Duration>, 29 | } 30 | 31 | impl ManagerInfo { 32 | pub fn new(manager: ManagerType, job_id: String, time_limit: Option<Duration>) -> Self { 33 | Self { 34 | manager, 35 | allocation_id: job_id, 36 | time_limit, 37 | } 38 | } 39 | } 40 | 41 | pub trait GetManagerInfo { 42 | fn get_manager_info(&self) -> Option<ManagerInfo>; 43 | } 44 | 45 | impl GetManagerInfo for WorkerConfiguration { 46 | fn get_manager_info(&self) -> Option<ManagerInfo> { 47 | self.extra 48 | .get(WORKER_EXTRA_MANAGER_KEY) 49 | .and_then(|info| serde_json::from_str(info).ok()) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/manager/mod.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | pub mod info; 3 | pub mod pbs; 4 | pub mod slurm; 5 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod arraydef; 2 | pub mod arrayparser; 3 | pub mod cli; 4 | pub mod env; 5 | pub mod error; 6 | pub mod format; 7 | pub mod idcounter; 8 | pub mod manager; 9 | pub mod parser; 10 | pub mod parser2; 11 | pub mod placeholders; 12 | pub mod rpc; 13 | pub mod serialization; 14 | pub mod serverdir; 15 | pub mod setup; 16 | pub mod utils; 17 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/rpc.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{Debug, Formatter}; 2 | use tokio::sync::oneshot::Receiver; 3 | use tokio::sync::{mpsc, oneshot}; 4 | 5 | /// Can be used to respond to an RPC call.
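///
/// A rough usage sketch (the `tx` channel and the `Query` message type are hypothetical,
/// they are not defined in this module):
/// ```ignore
/// let (token, rx) = ResponseToken::<u32>::new();
/// tx.send(Query { token })?; // hand the token to the request handler
/// let answer = rx.await?;    // resolves once the handler calls `token.respond(...)`
/// ```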
6 | #[must_use = "response token should be used to respond to a request"] 7 | pub struct ResponseToken<T> { 8 | sender: oneshot::Sender<T>, 9 | } 10 | 11 | impl<T> Debug for ResponseToken<T> { 12 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 13 | f.write_str("Response token") 14 | } 15 | } 16 | 17 | impl<T> ResponseToken<T> { 18 | pub fn new() -> (ResponseToken<T>, Receiver<T>) { 19 | let (tx, rx) = oneshot::channel::<T>(); 20 | (Self { sender: tx }, rx) 21 | } 22 | 23 | pub fn respond(self, response: T) { 24 | if let Err(_e) = self.sender.send(response) { 25 | log::warn!("Could not send response to RPC method, the other end hung up"); 26 | } 27 | } 28 | } 29 | 30 | /// Helper function for creating request-response RPC calls. 31 | /// Expects a callback that will receive a response token. 32 | /// The returned receiver resolves once the response token has been used to respond. 33 | pub fn initiate_request<T, R, F>(make_request: F) -> oneshot::Receiver<R> 34 | where 35 | F: FnOnce(ResponseToken<R>) -> Result<(), mpsc::error::SendError<T>>, 36 | R: std::fmt::Debug, 37 | { 38 | let (token, rx) = ResponseToken::new(); 39 | if let Err(error) = make_request(token) { 40 | log::warn!("Could not make RPC request: {error:?}"); 41 | } 42 | rx 43 | } 44 | 45 | pub type RpcSender<T> = mpsc::UnboundedSender<T>; 46 | pub type RpcReceiver<T> = mpsc::UnboundedReceiver<T>; 47 | 48 | pub fn make_rpc_queue<T>() -> (RpcSender<T>, RpcReceiver<T>) { 49 | mpsc::unbounded_channel() 50 | } 51 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/serialization.rs: -------------------------------------------------------------------------------- 1 | use bincode::Options; 2 | use serde::de::DeserializeOwned; 3 | use serde::{Deserialize, Serialize}; 4 | use std::fmt::{Debug, Formatter}; 5 | use std::marker::PhantomData; 6 | 7 | /// Helper trait to configure serialization options via separate types. 8 | pub trait SerializationConfig { 9 | fn config() -> impl Options; 10 | } 11 | 12 | pub struct DefaultConfig; 13 | 14 | impl SerializationConfig for DefaultConfig { 15 | fn config() -> impl Options { 16 | bincode::DefaultOptions::new().with_limit(tako::MAX_FRAME_SIZE as u64) 17 | } 18 | } 19 | 20 | pub struct TrailingAllowedConfig; 21 | 22 | impl SerializationConfig for TrailingAllowedConfig { 23 | fn config() -> impl Options { 24 | bincode::DefaultOptions::new() 25 | .allow_trailing_bytes() 26 | .with_limit(tako::MAX_FRAME_SIZE as u64) 27 | } 28 | } 29 | 30 | /// Strongly typed wrapper over `T` serialized with Bincode.
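///
/// A small illustrative sketch (the `Message` type is hypothetical):
/// ```ignore
/// #[derive(serde::Serialize, serde::Deserialize)]
/// struct Message { id: u32 }
///
/// let wrapped: Serialized<Message, DefaultConfig> = Serialized::new(&Message { id: 1 })?;
/// let restored: Message = wrapped.deserialize()?;
/// ```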
31 | #[derive(Serialize, Deserialize)] 32 | pub struct Serialized<T, C> { 33 | #[serde(with = "serde_bytes")] 34 | data: Box<[u8]>, 35 | _phantom: PhantomData<(T, C)>, 36 | } 37 | 38 | impl<T, C> Debug for Serialized<T, C> { 39 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 40 | write!( 41 | f, 42 | "Serialized {} ({}) byte(s)", 43 | std::any::type_name::<T>(), 44 | self.data.len() 45 | ) 46 | } 47 | } 48 | 49 | impl<T, C> Clone for Serialized<T, C> { 50 | fn clone(&self) -> Self { 51 | Self { 52 | data: self.data.clone(), 53 | _phantom: PhantomData, 54 | } 55 | } 56 | } 57 | 58 | impl<T: Serialize + DeserializeOwned, C: SerializationConfig> Serialized<T, C> { 59 | pub fn new(value: &T) -> bincode::Result<Self> { 60 | let result = C::config().serialize(value)?; 61 | // Check that we're not reallocating needlessly in `into_boxed_slice` 62 | debug_assert_eq!(result.capacity(), result.len()); 63 | Ok(Self { 64 | data: result.into_boxed_slice(), 65 | _phantom: Default::default(), 66 | }) 67 | } 68 | 69 | pub fn deserialize(&self) -> bincode::Result<T> { 70 | C::config().deserialize(&self.data) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crates/hyperqueue/src/common/setup.rs: -------------------------------------------------------------------------------- 1 | use env_logger::DEFAULT_FILTER_ENV; 2 | use env_logger::fmt::style::{AnsiColor, Color, Style}; 3 | use log::LevelFilter; 4 | use std::io::Write; 5 | 6 | /// Sets the behavior of the logger, based on passed environment variables 7 | /// such as `RUST_LOG`. 8 | pub fn setup_logging(verbose: bool) { 9 | let mut builder = env_logger::Builder::default(); 10 | builder.filter_level(if verbose { 11 | LevelFilter::Debug 12 | } else { 13 | LevelFilter::Info 14 | }); 15 | 16 | let has_debug = std::env::var(DEFAULT_FILTER_ENV) 17 | .map(|v| v.contains("debug")) 18 | .unwrap_or(false); 19 | 20 | if verbose || has_debug { 21 | builder.format_timestamp_millis(); 22 | } else { 23 | // Shortened format 24 | //