├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── test.yml │ └── wheels.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── build-requirements.txt ├── clippy.toml ├── controller ├── Cargo.toml ├── build.rs └── src │ ├── bootstrap.rs │ ├── history.rs │ ├── lib.rs │ └── main.rs ├── examples ├── __init__.py ├── grpo_actor.py └── notebooks │ ├── README.md │ ├── ping_pong.ipynb │ └── spmd_ddp.ipynb ├── hyper ├── Cargo.toml ├── hyper.dotslash.py ├── src │ ├── commands.rs │ ├── commands │ │ ├── demo.rs │ │ ├── procs.rs │ │ ├── serve.rs │ │ ├── show.rs │ │ └── top.rs │ ├── lib.rs │ ├── main.rs │ ├── tui │ │ ├── mod.rs │ │ └── top.rs │ └── utils │ │ ├── mod.rs │ │ └── system_address.rs └── tests │ └── demo_test.py ├── hyperactor ├── Cargo.toml ├── example │ └── derive.rs └── src │ ├── accum.rs │ ├── actor │ ├── mod.rs │ └── remote.rs │ ├── cap.rs │ ├── channel │ ├── local.rs │ ├── mod.rs │ ├── net.rs │ └── sim.rs │ ├── checkpoint.rs │ ├── clock.rs │ ├── config.rs │ ├── data.rs │ ├── init.rs │ ├── lib.rs │ ├── mailbox │ ├── durable_mailbox_sender.rs │ ├── mailbox_admin_message.rs │ └── mod.rs │ ├── message.rs │ ├── metrics.rs │ ├── panic_handler.rs │ ├── parse.rs │ ├── proc.rs │ ├── reference.rs │ ├── simnet.rs │ ├── spawn.rs │ ├── supervision.rs │ ├── sync │ ├── flag.rs │ ├── mod.rs │ └── monitor.rs │ └── test_utils │ ├── mod.rs │ ├── pingpong.rs │ ├── proc_supervison.rs │ ├── process_assertion.rs │ └── tracing.rs ├── hyperactor_extension ├── Cargo.toml └── src │ ├── alloc.rs │ ├── lib.rs │ └── telemetry.rs ├── hyperactor_macros ├── Cargo.toml ├── build.rs ├── src │ └── lib.rs └── tests │ └── basic.rs ├── hyperactor_mesh ├── Cargo.toml ├── examples │ └── dining_philosophers.rs ├── src │ ├── actor_mesh.rs │ ├── alloc │ │ ├── local.rs │ │ ├── logtailer.rs │ │ ├── mod.rs │ │ ├── process.rs │ │ └── remoteprocess.rs │ ├── assign.rs │ ├── bootstrap.rs │ ├── comm │ │ ├── mod.rs │ │ └── multicast.rs │ ├── lib.rs │ ├── mesh.rs │ ├── mesh_selection.rs │ ├── metrics.rs │ ├── proc_mesh │ │ ├── mesh_agent.rs │ │ └── mod.rs │ ├── reference.rs │ ├── shortuuid.rs │ └── test_utils.rs └── test │ └── bootstrap.rs ├── hyperactor_mesh_macros ├── Cargo.toml └── src │ └── lib.rs ├── hyperactor_multiprocess ├── Cargo.toml └── src │ ├── lib.rs │ ├── ping_pong.rs │ ├── proc_actor.rs │ ├── pyspy.rs │ ├── scheduler.rs │ ├── supervision.rs │ ├── system.rs │ └── system_actor.rs ├── hyperactor_telemetry ├── Cargo.toml ├── src │ ├── lib.rs │ ├── otel.rs │ ├── pool.rs │ ├── recorder.rs │ └── spool.rs ├── stubs │ ├── fbinit │ │ └── src │ │ │ └── lib.rs │ └── scuba │ │ └── src │ │ └── lib.rs └── tester │ ├── Cargo.toml │ └── main.rs ├── monarch_extension ├── Cargo.toml ├── build.rs └── src │ ├── client.rs │ ├── controller.rs │ ├── convert.rs │ ├── debugger.rs │ ├── lib.rs │ ├── panic.rs │ ├── simulator_client.rs │ └── tensor_worker.rs ├── monarch_hyperactor ├── Cargo.toml └── src │ ├── actor.rs │ ├── actor_mesh.rs │ ├── alloc.rs │ ├── bin │ └── process_allocator │ │ ├── common.rs │ │ └── main.rs │ ├── bootstrap.rs │ ├── lib.rs │ ├── mailbox.rs │ ├── ndslice.rs │ ├── proc.rs │ ├── proc_mesh.rs │ ├── runtime.rs │ ├── selection.rs │ └── shape.rs ├── monarch_messages ├── Cargo.toml ├── build.rs ├── src │ ├── client.rs │ ├── controller.rs │ ├── debugger.rs │ ├── lib.rs │ ├── wire_value.rs │ └── worker.rs └── test_utils.py ├── monarch_meta_extension ├── Cargo.toml └── src │ ├── alloc.rs │ ├── alloc_mock.rs │ └── lib.rs ├── monarch_rdma ├── Cargo.toml ├── 
examples │ ├── Cargo.toml │ ├── bootstrap.rs │ ├── main.rs │ └── parameter_server.rs └── src │ ├── ibverbs_primitives.rs │ ├── lib.rs │ ├── rdma_buffer.rs │ ├── rdma_components.rs │ └── rdma_manager_actor.rs ├── monarch_simulator ├── Cargo.toml └── src │ ├── bootstrap.rs │ ├── collective_coordinator.rs │ ├── controller.rs │ ├── lib.rs │ ├── main.rs │ ├── simulator.rs │ └── worker.rs ├── monarch_tensor_worker ├── Cargo.toml ├── build.rs ├── src │ ├── bootstrap.rs │ ├── borrow.rs │ ├── comm.rs │ ├── device_mesh.rs │ ├── lib.rs │ ├── pipe.rs │ ├── py_pipe.rs │ ├── stream.rs │ └── test_util.rs ├── test_utils.py └── test_worker_main.py ├── monarch_types ├── Cargo.toml └── src │ ├── lib.rs │ ├── pyobject.rs │ ├── python.rs │ └── pytree.rs ├── nccl-sys ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── nccl.h ├── ndslice ├── Cargo.toml └── src │ ├── lib.rs │ ├── reshape.rs │ ├── selection │ ├── mod.rs │ ├── normal.rs │ ├── parse.rs │ ├── pretty.rs │ ├── routing.rs │ ├── test_utils.rs │ └── token_parser.rs │ ├── shape.rs │ ├── slice.rs │ └── strategy.rs ├── pyproject.toml ├── python ├── monarch │ ├── __init__.py │ ├── _monarch │ │ ├── __init__.py │ │ ├── hyperactor │ │ │ └── __init__.py │ │ ├── selection │ │ │ └── __init__.py │ │ └── worker │ │ │ ├── __init__.py │ │ │ ├── debugger.py │ │ │ └── logging.py │ ├── _rust_bindings │ │ ├── __init__.pyi │ │ ├── controller │ │ │ └── bootstrap.pyi │ │ ├── hyperactor_extension │ │ │ ├── alloc.pyi │ │ │ └── telemetry.pyi │ │ ├── monarch_extension │ │ │ ├── __init__.pyi │ │ │ ├── client.pyi │ │ │ ├── controller.pyi │ │ │ ├── debugger.pyi │ │ │ ├── panic.pyi │ │ │ ├── simulator_client.pyi │ │ │ └── tensor_worker.pyi │ │ ├── monarch_hyperactor │ │ │ ├── actor.pyi │ │ │ ├── actor_mesh.pyi │ │ │ ├── alloc.pyi │ │ │ ├── bootstrap.pyi │ │ │ ├── mailbox.pyi │ │ │ ├── proc.pyi │ │ │ ├── proc_mesh.pyi │ │ │ ├── runtime.pyi │ │ │ ├── selection.pyi │ │ │ └── shape.pyi │ │ ├── monarch_messages │ │ │ └── debugger.pyi │ │ ├── monarch_tensor_worker │ │ │ └── bootstrap.pyi │ │ └── old.pyi │ ├── _testing.py │ ├── actor_mesh.py │ ├── allocator.py │ ├── bootstrap_main.py │ ├── builtins │ │ ├── __init__.py │ │ ├── log.py │ │ └── random.py │ ├── cached_remote_function.py │ ├── common │ │ ├── _C.pyi │ │ ├── __init__.py │ │ ├── _coalescing.py │ │ ├── _device_utils.py │ │ ├── _tensor_to_table.py │ │ ├── base_tensor.py │ │ ├── borrows.py │ │ ├── client.py │ │ ├── constants.py │ │ ├── context_manager.py │ │ ├── controller_api.py │ │ ├── device_mesh.py │ │ ├── fake.py │ │ ├── function.py │ │ ├── function_caching.py │ │ ├── future.py │ │ ├── init.cpp │ │ ├── invocation.py │ │ ├── mast.py │ │ ├── messages.py │ │ ├── mock_cuda.cpp │ │ ├── mock_cuda.h │ │ ├── mock_cuda.py │ │ ├── opaque_ref.py │ │ ├── pickle_flatten.py │ │ ├── pipe.py │ │ ├── process_group.py │ │ ├── recording.py │ │ ├── reference.py │ │ ├── remote.py │ │ ├── selection.py │ │ ├── shape.py │ │ ├── stream.py │ │ ├── tensor.py │ │ ├── tensor_factory.py │ │ └── tree.py │ ├── controller │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── controller.py │ │ ├── debugger.py │ │ ├── history.py │ │ └── rust_backend │ │ │ ├── __init__.py │ │ │ └── controller.py │ ├── fetch.py │ ├── future.py │ ├── gradient │ │ ├── __init__.py │ │ ├── _gradient_generator.cpp │ │ └── _gradient_generator.pyi │ ├── gradient_generator.py │ ├── memory.py │ ├── notebook.py │ ├── opaque_module.py │ ├── opaque_object.py │ ├── parallel │ │ ├── __init__.py │ │ └── pipelining │ │ │ ├── __init__.py │ │ │ ├── runtime.py │ │ │ ├── schedule_ir.py │ │ │ └── 
scheduler.py │ ├── proc_mesh.py │ ├── profiler.py │ ├── python_local_mesh.py │ ├── random.py │ ├── rdma.py │ ├── remote_class.py │ ├── rust_backend_mesh.py │ ├── rust_local_mesh.py │ ├── sim_mesh.py │ ├── simulator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── command_history.py │ │ ├── config.py │ │ ├── interface.py │ │ ├── ir.py │ │ ├── mock_controller.py │ │ ├── profiling.py │ │ ├── simulator.py │ │ ├── task.py │ │ ├── tensor.py │ │ ├── trace.py │ │ ├── utils.py │ │ └── worker.py │ ├── tensor_worker_main.py │ ├── tensorboard.py │ ├── timer │ │ ├── README.md │ │ ├── __init__.py │ │ ├── example_monarch.py │ │ ├── example_spmd.py │ │ ├── execution_timer.py │ │ └── execution_timer_test.py │ ├── tools │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── commands.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ └── hyperactor.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── defaults.py │ │ └── mesh_spec.py │ ├── worker │ │ ├── __init__.py │ │ ├── _testing_function.py │ │ ├── compiled_block.py │ │ ├── debugger.py │ │ ├── lines.py │ │ ├── monitor.py │ │ └── worker.py │ └── world_mesh.py ├── monarch_supervisor │ ├── README.md │ ├── __init__.py │ ├── _testing.py │ ├── diagram.png │ ├── function_call.py │ ├── host.py │ ├── launchers.py │ ├── log_pstree.py │ ├── logging.py │ ├── python_executable.py │ └── worker │ │ └── worker_env.py └── tests │ ├── __init__.py │ ├── _monarch │ ├── test_client.py │ ├── test_controller.py │ ├── test_hyperactor.py │ ├── test_ndslice.py │ └── test_worker.py │ ├── builtins │ ├── test_log.py │ └── test_random.py │ ├── dispatch_bench.py │ ├── dispatch_bench_helper.py │ ├── error_test_binary.py │ ├── requirements.txt │ ├── simulator │ ├── __init__.py │ ├── test_profiling.py │ ├── test_simulator.py │ ├── test_task.py │ └── test_worker.py │ ├── sleep_binary.py │ ├── test_actor_error.py │ ├── test_alloc.py │ ├── test_coalescing.py │ ├── test_controller.py │ ├── test_device_mesh.py │ ├── test_fault_tolerance.py │ ├── test_future.py │ ├── test_grad_generator.py │ ├── test_mock_cuda.py │ ├── test_pdb_actor.py │ ├── test_python_actors.py │ ├── test_remote_functions.py │ ├── test_rust_backend.py │ ├── test_signal_safe_block_on.py │ ├── test_sim_backend.py │ └── tools │ ├── config │ └── test_defaults.py │ ├── test_cli.py │ ├── test_commands.py │ ├── test_mesh_spec.py │ └── utils.py ├── requirements.txt ├── rust-toolchain ├── rustfmt.toml ├── setup.py ├── timed_test ├── Cargo.toml ├── src │ └── lib.rs └── tests │ └── basic.rs ├── tools └── rust │ └── ossconfigs │ └── clippy.toml └── torch-sys ├── Cargo.toml ├── README.md ├── build.rs ├── src ├── backend.rs ├── bindings.rs ├── borrow.rs ├── bridge.cpp ├── bridge.h ├── bridge.rs ├── call_op.rs ├── cell.rs ├── cuda.rs ├── device.rs ├── ivalue.rs ├── layout.rs ├── lib.rs ├── memory_format.rs ├── nccl.rs ├── pyobject.rs ├── rvalue.rs ├── scalar_type.rs ├── tensor.rs └── torch.hpp └── test_utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | .pyre 4 | docs 5 | *_meta/** 6 | **/*_meta/** 7 | **/*_meta.rs 8 | **/meta/** 9 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 256 3 | extend-ignore = E302, G004, SIM105, G201, SIM115, SIM904 4 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: 
-------------------------------------------------------------------------------- 1 | name: Build monarch 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | - gh/** 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | test: 18 | name: cuda12.6-py3.10-4xlarge 19 | strategy: 20 | fail-fast: true 21 | matrix: 22 | include: 23 | - name: 4xlarge 24 | runs-on: linux.g5.4xlarge.nvidia.gpu 25 | torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126' 26 | gpu-arch-type: "cuda" 27 | gpu-arch-version: "12.6" 28 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 29 | with: 30 | timeout: 60 31 | runner: ${{ matrix.runs-on }} 32 | gpu-arch-type: ${{ matrix.gpu-arch-type }} 33 | gpu-arch-version: ${{ matrix.gpu-arch-version }} 34 | submodules: recursive 35 | script: | 36 | conda create -n venv python=3.10 -y 37 | conda activate venv 38 | export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH 39 | python -m pip install --upgrade pip 40 | 41 | # Install native dependencies 42 | dnf update -y 43 | dnf install clang-devel libunwind libunwind-devel -y 44 | 45 | # Install rust and setup nightly toolchain 46 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 47 | source $HOME/.cargo/env 48 | rustup toolchain install nightly 49 | rustup default nightly 50 | 51 | # Install build dependencies 52 | pip install -r build-requirements.txt 53 | 54 | # Install test dependencies 55 | pip install -r python/tests/requirements.txt 56 | 57 | # Build and install monarch 58 | # NB: monarch currently can't be built in isolated builds (e.g. it is not PEP 517 compatible) 59 | # because 'torch-sys' needs to be compiled against 'torch' in the main python environment 60 | # so that libtorch is linked correctly at runtime. 61 | pip install --no-build-isolation . 62 | 63 | # Run tests 64 | LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" 65 | python python/tests/test_mock_cuda.py 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | python/**/*.so 4 | python/**/*.json 5 | python/**/*.html 6 | python/**/*.pkl 7 | python/**/__pycache__ 8 | python/monarch.egg-info/* 9 | *.egg 10 | build/* 11 | dist/* 12 | monarch.egg-info/* 13 | python/monarch/monarch_controller 14 | 15 | .ipynb_checkpoints 16 | 17 | # Rust stuff 18 | target/ 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Meta Open Source Projects 2 | 3 | We want to make contributing to this project as easy and transparent as 4 | possible. 5 | 6 | ## Pull Requests 7 | We actively welcome your pull requests. 8 | 9 | Note: pull requests are not merged into the GitHub repository in the usual way. There is an internal Meta repository that is the "source of truth" for the project. The GitHub repository is generated *from* the internal Meta repository. So we don't merge GitHub PRs directly to the GitHub repository -- they must first be imported into the internal Meta repository. When Meta employees look at the GitHub PR, there is a special button visible only to them that executes that import.
The changes are then automatically reflected from the internal Meta repository back to GitHub. This is why you won't see your PR being merged directly, but you will still see your changes in the repository once it reflects the imported changes. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: <https://code.facebook.com/cla> 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## License 33 | By contributing to this project, you agree that your contributions will be licensed 34 | under the LICENSE file in the root directory of this source tree. 35 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "controller", 5 | "hyper", 6 | "hyperactor", 7 | "hyperactor_macros", 8 | "hyperactor_multiprocess", 9 | "hyperactor_mesh", 10 | "hyperactor_mesh_macros", 11 | "ndslice", 12 | "monarch_extension", 13 | "monarch_tensor_worker", 14 | "nccl-sys", 15 | "torch-sys", 16 | ] 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Pre-reqs: 2 | # 1. podman (shown below) or just docker 3 | # $ dnf install -y podman podman-docker 4 | # 2. NVIDIA container toolkit 5 | # $ dnf install -y nvidia-container-toolkit 6 | # 7 | # Build: 8 | # $ cd ~/monarch 9 | # $ export TAG_NAME=$USER-dev 10 | # $ docker build --network=host \ 11 | # -t monarch:$TAG_NAME \ 12 | # -f Dockerfile . 13 | # 14 | # Build (with http proxy): 15 | # $ docker build --network=host \ 16 | # --build-arg=http_proxy=$http_proxy \ 17 | # --build-arg=https_proxy=$https_proxy \ 18 | # -t monarch:$TAG_NAME \ 19 | # -f Dockerfile . 20 | # 21 | ARG http_proxy 22 | ARG https_proxy 23 | 24 | FROM pytorch/pytorch:2.7.0-cuda12.6-cudnn9-devel 25 | WORKDIR /monarch 26 | 27 | # export http proxy env vars if build-args are provided 28 | RUN if [ -n "${http_proxy}" ]; then export http_proxy=${http_proxy}; fi && \ 29 | if [ -n "${https_proxy}" ]; then export https_proxy=${https_proxy}; fi 30 | 31 | # Install native dependencies 32 | RUN apt-get update -y && \ 33 | apt-get -y install curl clang liblzma-dev libunwind-dev 34 | 35 | # Install Rust 36 | ENV PATH="/root/.cargo/bin:${PATH}" 37 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 38 | 39 | # Install Python deps as a separate layer to avoid rebuilding if deps do not change 40 | COPY requirements.txt . 41 | RUN pip install --no-cache-dir -r requirements.txt 42 | 43 | # Install monarch 44 | COPY . .
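# Run (a sketch; assumes the NVIDIA container toolkit is configured on the host
# and that the image was tagged monarch:$TAG_NAME as in the build commands above):
# $ docker run --rm -it --gpus all monarch:$TAG_NAME bash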
45 | RUN cargo install --path monarch_hyperactor 46 | RUN pip install . 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) Meta Platforms, Inc. and affiliates. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Monarch 🦋 2 | 3 | **Monarch** is a distributed execution engine for PyTorch. Our overall goal is 4 | to deliver the high-quality user experience that people get from single-GPU 5 | PyTorch, but at cluster scale. 6 | 7 | > ⚠️ **Early Development Warning** Monarch is currently in an experimental 8 | > stage. You should expect bugs, incomplete features, and APIs that may change 9 | > in future versions. The project welcomes bugfixes, but to make sure things are 10 | > well coordinated you should discuss any significant change before starting the 11 | > work. It's recommended that you signal your intention to contribute in the 12 | > issue tracker, either by filing a new issue or by claiming an existing one. 
13 | 14 | Note: Monarch is currently only supported on Linux systems. 15 | 16 | ## Installation 17 | 18 | `pip install torchmonarch` 19 | 20 | or manually 21 | 22 | ```sh 23 | 24 | # Create and activate the conda environment 25 | conda create -n monarchenv python=3.10 -y 26 | conda activate monarchenv 27 | 28 | # Install nightly rust toolchain 29 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 30 | rustup toolchain install nightly 31 | rustup default nightly 32 | 33 | # Install non-python dependencies 34 | conda install libunwind -y 35 | 36 | # Install the correct cuda and cuda-toolkit versions for your machine 37 | sudo dnf install cuda-toolkit-12-0 cuda-12-0 38 | 39 | # Install clang-dev and nccl-dev 40 | sudo dnf install clang-devel libnccl-devel 41 | # Or, in some environments, the following may be necessary instead 42 | conda install -c conda-forge clangdev nccl 43 | conda update -n monarchenv --all -c conda-forge -y 44 | 45 | # Install build dependencies 46 | pip install -r build-requirements.txt 47 | # Install test dependencies 48 | pip install -r python/tests/requirements.txt 49 | 50 | # Build and install Monarch 51 | pip install --no-build-isolation . 52 | # or set up for development 53 | pip install --no-build-isolation -e . 54 | 55 | # Run unit tests. Consider -s for more verbose output 56 | pytest python/tests/ -v -m "not oss_skip" 57 | ``` 58 | 59 | ## Running examples 60 | 61 | Check out the `examples/` directory for demonstrations of how to use Monarch's APIs. 62 | 63 | We'll be adding more examples as we stabilize and polish functionality! 64 | 65 | ## License 66 | 67 | Monarch is BSD-3 licensed, as found in the [LICENSE](LICENSE) file. 68 | -------------------------------------------------------------------------------- /build-requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | setuptools 3 | setuptools-rust 4 | wheel 5 | numpy 6 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | too-many-lines-threshold = 200 2 | await-holding-invalid-types = [ 3 | { path = "tracing::span::Entered", reason = "`Entered` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 4 | { path = "tracing::span::EnteredSpan", reason = "`EnteredSpan` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 5 | ] 6 | -------------------------------------------------------------------------------- /controller/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/controller:[controller,controller-bin] 2 | 3 | [package] 4 | name = "controller" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "controller_bin" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | anyhow = "1.0.95" 16 | async-trait = "0.1.86" 17 | bincode = "1.3.3" 18 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 19 | const_format = "0.2" 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | monarch_messages = { version =
"0.0.0", path = "../monarch_messages" } 24 | nccl-sys = { path = "../nccl-sys" } 25 | ndslice = { version = "0.0.0", path = "../ndslice" } 26 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 27 | serde = { version = "1.0.185", features = ["derive", "rc"] } 28 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 29 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 30 | torch-sys = { path = "../torch-sys" } 31 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 32 | 33 | [dev-dependencies] 34 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 35 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 36 | -------------------------------------------------------------------------------- /controller/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // This is needed because the controller is built with torch/nccl deps via monarch_messages. 10 | 11 | fn main() { 12 | // `torch-sys` will set this env var through Cargo `links` metadata. 13 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 14 | // Set the rpath so that the dynamic linker can find libtorch and friends. 15 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 16 | 17 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 18 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 19 | } 20 | 21 | // Disable new dtags, as conda envs generally use `RPATH` over `RUNPATH`. 22 | println!("cargo::rustc-link-arg=-Wl,--disable-new-dtags"); 23 | 24 | println!("cargo:rustc-link-lib=lzma"); 25 | } 26 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/examples/__init__.py -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Example Notebooks 2 | 3 | This folder contains some basic examples of using the Monarch API in jupyter notebooks. 4 | 5 | ## Setup 6 | 1. Follow the instructions outlined in ../../monarch/README.md to set up Monarch 7 | 2. Pip install jupyter: 8 | `pip install jupyter notebook` 9 | 3. Run your jupyter notebook: `jupyter notebook` 10 | 4. (optional) In remote settings (as on a devserver), you can also port forward your jupyter notebook to your local machine, e.g. 11 | ``` 12 | # devserver 13 | jupyter notebook --no-browser --port=8098 14 | 15 | # local 16 | ssh -N -L 8098:localhost:8098 17 | ``` 18 | 5. Open localhost:8098 in your browser to see the jupyter notebook 19 | 20 | 21 | ## Manifest 22 | * ping_pong.ipynb - Simple hello world with Actor API + Inter Actor Communication 23 | -------------------------------------------------------------------------------- /hyper/Cargo.toml: -------------------------------------------------------------------------------- 1 | # This file is manually maintained to maintain the ability to build hyper.
The code is annotated with fbcode_build conditionals such that 2 | # it works with both cargo (all oss deps) and buck (full meta deps). 3 | [package] 4 | name = "hyper" 5 | version = "0.0.0" 6 | authors = ["Facebook"] 7 | edition = "2021" 8 | license = "MIT" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | clap = { version = "4.5.30", features = ["derive", "env", "string", "unicode", "wrap_help"] } 14 | hyperactor = { path = "../hyperactor" } 15 | hyperactor_multiprocess = { path = "../hyperactor_multiprocess" } 16 | serde = { version = "1.0.185", features = ["derive", "rc"] } 17 | serde_json = { version = "1.0.132", features = ["float_roundtrip", "unbounded_depth"] } 18 | tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } 19 | tokio = { version = "1.41.0", features = ["full", "test-util", "tracing"] } 20 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 21 | -------------------------------------------------------------------------------- /hyper/hyper.dotslash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env fbpython 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | import dotslash 9 | 10 | dotslash.export_fbcode_build( 11 | target="fbcode//monarch/hyper:hyper", 12 | oncall="monarch", 13 | ) 14 | -------------------------------------------------------------------------------- /hyper/src/commands.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod demo; 10 | pub mod procs; 11 | pub mod serve; 12 | pub mod show; 13 | #[cfg(fbcode_build)] 14 | pub mod top; 15 | -------------------------------------------------------------------------------- /hyper/src/commands/serve.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::time::Duration; 10 | 11 | use hyperactor::channel::ChannelAddr; 12 | use hyperactor::channel::ChannelTransport; 13 | use hyperactor_multiprocess::system::System; 14 | 15 | // The commands in the demo spawn temporary actors that join a system. 16 | // Set a long heartbeat duration so we do not check heartbeats for these actors. 17 | // [`Duration::from_secs`] is a stable API. Any APIs with units bigger than secs are unstable. 18 | static LONG_DURATION: Duration = Duration::from_secs(500000); 19 | 20 | #[derive(clap::Args, Debug)] 21 | pub struct ServeCommand { 22 | /// The address to serve the system actor on. If not specified, the local 23 | /// host will be used.
24 | #[arg(short, long)] 25 | addr: Option<ChannelAddr>, 26 | } 27 | 28 | impl ServeCommand { 29 | pub async fn run(self) -> anyhow::Result<()> { 30 | let addr = self.addr.unwrap_or(ChannelAddr::any(ChannelTransport::Tcp)); 31 | let handle = System::serve(addr, LONG_DURATION, LONG_DURATION).await?; 32 | eprintln!("serve: {}", handle.local_addr()); 33 | handle.await; 34 | Ok(()) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /hyper/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod tui; 10 | pub mod utils; 11 | -------------------------------------------------------------------------------- /hyper/src/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | mod commands; 10 | 11 | use clap::Parser; 12 | use clap::Subcommand; 13 | 14 | use crate::commands::demo::DemoCommand; 15 | use crate::commands::procs::ProcsCommand; 16 | use crate::commands::serve::ServeCommand; 17 | use crate::commands::show::ShowCommand; 18 | #[cfg(fbcode_build)] 19 | use crate::commands::top::TopCommand; 20 | 21 | #[derive(Parser)] 22 | #[command()] 23 | struct Cli { 24 | #[command(subcommand)] 25 | command: Command, 26 | } 27 | 28 | #[derive(Subcommand)] 29 | enum Command { 30 | /// Spawns and serves a system actor. 31 | Serve(ServeCommand), 32 | #[command(subcommand)] 33 | /// Demo some basic concepts of multiprocess hyperactor. Before using these 34 | /// commands, use `serve` to start a system actor, and get the system 35 | /// address from the output. 36 | Demo(DemoCommand), 37 | #[clap(about = r#"Show the state of a reference. For example: 38 | - System: show 39 | - World: show world 40 | - Gang: show world.gang 41 | - Proc: show world[2] 42 | - Actor: show world[3].actor[1]"#)] 43 | Show(ShowCommand), 44 | #[clap(about = "Show details about processes running in worlds.")] 45 | #[command(subcommand)] 46 | Procs(ProcsCommand), 47 | #[cfg(fbcode_build)] 48 | #[clap(about = "Show a dynamic real-time view of the system")] 49 | Top(TopCommand), 50 | } 51 | 52 | #[cfg(fbcode_build)] 53 | #[fbinit::main] 54 | async fn main(_: fbinit::FacebookInit) -> Result<(), anyhow::Error> { 55 | run().await 56 | } 57 | 58 | #[cfg(not(fbcode_build))] 59 | #[tokio::main] 60 | async fn main() -> Result<(), anyhow::Error> { 61 | run().await 62 | } 63 | 64 | async fn run() -> Result<(), anyhow::Error> { 65 | let args = Cli::parse(); 66 | hyperactor::initialize(); 67 | 68 | match args.command { 69 | Command::Serve(command) => Ok(command.run().await?), 70 | Command::Demo(command) => Ok(command.run().await?), 71 | Command::Show(command) => Ok(command.run().await?), 72 | Command::Procs(command) => Ok(command.run().await?), 73 | #[cfg(fbcode_build)] Command::Top(command) => Ok(command.run().await?), 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /hyper/src/tui/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc.
and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod top; 10 | -------------------------------------------------------------------------------- /hyper/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod system_address; 10 | -------------------------------------------------------------------------------- /hyperactor/src/cap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Capabilities used in various public APIs. 10 | 11 | /// CanSend is a capability that confers the right of the holder to send 12 | /// messages to actors. CanSend is sealed and may only be implemented 13 | /// and accessed by this crate. 14 | pub trait CanSend: sealed::CanSend {} 15 | impl<T: sealed::CanSend> CanSend for T {} 16 | 17 | /// CanOpenPort is a capability that confers the ability of the holder to 18 | /// open local ports, which can then be used to receive messages. 19 | pub trait CanOpenPort: sealed::CanOpenPort {} 20 | impl<T: sealed::CanOpenPort> CanOpenPort for T {} 21 | 22 | /// CanSplitPort is a capability that confers the ability of the holder to 23 | /// split ports. 24 | pub trait CanSplitPort: sealed::CanSplitPort {} 25 | impl<T: sealed::CanSplitPort> CanSplitPort for T {} 26 | 27 | /// CanSpawn is a capability that confers the ability to spawn a child 28 | /// actor. 29 | pub trait CanSpawn: sealed::CanSpawn {} 30 | impl<T: sealed::CanSpawn> CanSpawn for T {} 31 | 32 | pub(crate) mod sealed { 33 | use async_trait::async_trait; 34 | 35 | use crate::PortId; 36 | use crate::actor::Actor; 37 | use crate::actor::ActorHandle; 38 | use crate::data::Serialized; 39 | use crate::mailbox::Mailbox; 40 | 41 | pub trait CanSend: Send + Sync { 42 | fn post(&self, dest: PortId, data: Serialized); 43 | } 44 | 45 | pub trait CanOpenPort: Send + Sync { 46 | fn mailbox(&self) -> &Mailbox; 47 | } 48 | 49 | pub trait CanSplitPort: Send + Sync { 50 | fn split(&self, port_id: PortId, reducer: Option) -> PortId; 51 | } 52 | 53 | #[async_trait] 54 | pub trait CanSpawn: Send + Sync { 55 | async fn spawn<A: Actor>(&self, params: A::Params) -> anyhow::Result<ActorHandle<A>>; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /hyperactor/src/checkpoint.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Checkpoint functionality for various objects to save and load states. 10 | 11 | use std::fmt::Debug; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::RemoteMessage; 16 | use crate::mailbox::log::SeqId; 17 | 18 | /// Errors that occur during checkpoint operations. 19 | /// This enum is marked non-exhaustive to allow for extensibility.
20 | #[derive(thiserror::Error, Debug)] 21 | #[non_exhaustive] 22 | pub enum CheckpointError { 23 | /// An error occurred while saving a checkpoint. 24 | #[error("save")] 25 | Save(#[source] anyhow::Error), 26 | 27 | /// An error occurred while loading a checkpoint. 28 | #[error("load: {0}")] 29 | Load(SeqId, #[source] anyhow::Error), 30 | } 31 | 32 | /// [`Checkpointable`] is used to save the state of an instance so that it can be restored later. 33 | #[async_trait] 34 | pub trait Checkpointable: Send + Sync + Sized { 35 | /// The type of the state that is saved. The state can be serialized to and deserialized 36 | /// from persistent storage. 37 | type State: RemoteMessage; 38 | 39 | /// Saves the current state. 40 | async fn save(&self) -> Result<Self::State, CheckpointError>; 41 | 42 | /// Loads a saved state to restore the instance. 43 | async fn load(state: Self::State) -> Result<Self, CheckpointError>; 44 | } 45 | -------------------------------------------------------------------------------- /hyperactor/src/mailbox/mailbox_admin_message.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use serde::Deserialize; 10 | use serde::Serialize; 11 | 12 | pub use crate as hyperactor; 13 | use crate::HandleClient; 14 | use crate::Handler; 15 | use crate::Named; 16 | use crate::ProcId; 17 | use crate::RefClient; 18 | use crate::mailbox::ChannelAddr; 19 | 20 | /// Messages relating to mailbox administration. 21 | #[derive( 22 | Handler, 23 | HandleClient, 24 | RefClient, 25 | Debug, 26 | Serialize, 27 | Deserialize, 28 | Clone, 29 | PartialEq, 30 | Named 31 | )] 32 | pub enum MailboxAdminMessage { 33 | /// An address update. 34 | UpdateAddress { 35 | /// The ID of the proc. 36 | proc_id: ProcId, 37 | 38 | /// The address at which it listens. 39 | addr: ChannelAddr, 40 | }, 41 | } 42 | -------------------------------------------------------------------------------- /hyperactor/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A bunch of statically defined metrics. Defined here because they are used in 10 | //! both macros and handwritten code.
11 | 12 | use hyperactor_telemetry::declare_static_counter; 13 | use hyperactor_telemetry::declare_static_timer; 14 | use hyperactor_telemetry::declare_static_up_down_counter; 15 | 16 | declare_static_counter!(MESSAGES_SENT, "messages_sent"); 17 | declare_static_counter!(MESSAGES_RECEIVED, "messages_received"); 18 | declare_static_counter!(MESSAGE_HANDLE_ERRORS, "message_handle_errors"); 19 | declare_static_counter!(MESSAGE_RECEIVE_ERRORS, "message_receive_errors"); 20 | declare_static_up_down_counter!(MESSAGE_QUEUE_SIZE, "message_queue_size"); 21 | declare_static_timer!( 22 | MESSAGE_HANDLER_DURATION, 23 | "message_handler_duration", 24 | hyperactor_telemetry::TimeUnit::Nanos 25 | ); 26 | 27 | declare_static_timer!( 28 | ACTOR_STATUS, 29 | "actor.status", 30 | hyperactor_telemetry::TimeUnit::Nanos 31 | ); 32 | -------------------------------------------------------------------------------- /hyperactor/src/spawn.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::sync::Arc; 10 | use std::sync::atomic::AtomicU64; 11 | use std::sync::atomic::Ordering; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::actor::Actor; 16 | use crate::actor::ActorHandle; 17 | use crate::cap::sealed::CanSpawn; 18 | use crate::mailbox::BoxedMailboxSender; 19 | use crate::reference::ActorId; 20 | #[derive(Debug)] 21 | struct LocalSpawnerState { 22 | root: ActorId, 23 | sender: BoxedMailboxSender, 24 | next_pid: AtomicU64, 25 | } 26 | 27 | #[derive(Clone, Debug)] 28 | pub(crate) struct LocalSpawner(Option<Arc<LocalSpawnerState>>); 29 | 30 | impl LocalSpawner { 31 | pub(crate) fn new(root: ActorId, sender: BoxedMailboxSender) -> Self { 32 | Self(Some(Arc::new(LocalSpawnerState { 33 | root, 34 | sender, 35 | next_pid: AtomicU64::new(1), 36 | }))) 37 | } 38 | 39 | pub(crate) fn new_panicking() -> Self { 40 | Self(None) 41 | } 42 | } 43 | 44 | #[async_trait] 45 | impl CanSpawn for LocalSpawner { 46 | async fn spawn<A: Actor>(&self, params: A::Params) -> ActorHandle<A> { 47 | let state = self.0.as_ref().expect("invalid spawner"); 48 | let pid = state.next_pid.fetch_add(1, Ordering::Relaxed); 49 | let actor_id = state.root.child_id(pid); 50 | A::do_spawn(state.sender.clone(), actor_id, params, self.clone()) 51 | .await 52 | .unwrap() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /hyperactor/src/supervision.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Messages used in supervision. 10 | 11 | use std::fmt::Debug; 12 | 13 | use serde::Deserialize; 14 | use serde::Serialize; 15 | 16 | use crate as hyperactor; // for macros 17 | use crate::Named; 18 | use crate::actor::ActorStatus; 19 | use crate::reference::ActorId; 20 | 21 | /// This is the local actor supervision event. The child actor will propagate this event to its parent. 22 | #[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Eq)] 23 | pub struct ActorSupervisionEvent { 24 | /// The actor id of the child actor where the event is triggered.
25 | actor_id: ActorId, 26 | /// Status of the child actor. 27 | actor_status: ActorStatus, 28 | } 29 | 30 | impl ActorSupervisionEvent { 31 | /// Create a new actor supervision event. 32 | pub fn new(actor_id: ActorId, actor_status: ActorStatus) -> Self { 33 | Self { 34 | actor_id, 35 | actor_status, 36 | } 37 | } 38 | /// Get the actor id of the supervision event. 39 | pub fn actor_id(&self) -> &ActorId { 40 | &self.actor_id 41 | } 42 | /// Get the actor status of the supervision event. 43 | pub fn actor_status(&self) -> &ActorStatus { 44 | &self.actor_status 45 | } 46 | 47 | /// Consume this event to a tuple. 48 | pub fn into_inner(self) -> (ActorId, ActorStatus) { 49 | (self.actor_id, self.actor_status) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /hyperactor/src/sync/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Synchronization primitives that are used by Hyperactor. 10 | //! 11 | //! These are used in related Hyperactor crates as well, and are thus part of the 12 | //! public API. However, they should not be considered a stable part of the Hyperactor 13 | //! API itself, and they may be moved to a different crate in the future. 14 | 15 | pub mod flag; 16 | pub mod monitor; 17 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// PingPongActor test util. 10 | pub mod pingpong; 11 | /// ProcSupervisionCoordinator test util. 12 | pub mod proc_supervison; 13 | /// Used to verify behaviors related to process. 14 | pub mod process_assertion; 15 | /// Used for using tracing in tests. 16 | pub mod tracing; 17 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/process_assertion.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::future::Future; 10 | 11 | use nix::sys::wait::WaitStatus; 12 | use nix::sys::wait::waitpid; 13 | use nix::unistd::ForkResult; 14 | use nix::unistd::fork; 15 | 16 | /// Fork a child process, execute the given function in that process, and verify 17 | /// that the process exits with the given exit code. 18 | pub async fn assert_termination(f: F, expected_code: i32) -> anyhow::Result<()> 19 | where 20 | F: FnOnce() -> Fut, 21 | Fut: Future, 22 | { 23 | // SAFETY: for unit test process assertion. 24 | unsafe { 25 | match fork() { 26 | Ok(ForkResult::Parent { child, .. }) => match waitpid(child, None)? { 27 | WaitStatus::Exited(_, exit_code) => { 28 | anyhow::ensure!(exit_code == expected_code); 29 | Ok(()) 30 | } 31 | status => Err(anyhow::anyhow!( 32 | "didn't receive expected status. 
got: {:?}", 33 | status 34 | )), 35 | }, 36 | Ok(ForkResult::Child) => Ok(f().await), 37 | Err(_) => Err(anyhow::anyhow!("fork failed")), 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/tracing.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use tracing::Level; 10 | 11 | /// Set up a tracing subscriber with a filter, so we can print tracing logs 12 | /// with >= level by using *buck run*. 13 | /// 14 | /// Note this function does not work with *buck test*. 15 | // 16 | /// This is better than the traced_test macro when logs_contain and logs_assert 17 | /// are not needed, because that macro prints TRACE level logs, which is too 18 | /// verbose. 19 | pub fn set_tracing_env_filter(level: Level) { 20 | let subscriber = tracing_subscriber::fmt() 21 | .with_env_filter(tracing_subscriber::EnvFilter::new(level.as_str())) 22 | .finish(); 23 | tracing::subscriber::set_global_default(subscriber).expect("Failed to set subscriber"); 24 | } 25 | -------------------------------------------------------------------------------- /hyperactor_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_extension:hyperactor_extension 2 | 3 | [package] 4 | name = "hyperactor_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | async-trait = "0.1.86" 12 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 13 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 14 | ndslice = { version = "0.0.0", path = "../ndslice" } 15 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 16 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 17 | -------------------------------------------------------------------------------- /hyperactor_extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod alloc; 12 | pub mod telemetry; 13 | -------------------------------------------------------------------------------- /hyperactor_extension/src/telemetry.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | use pyo3::prelude::*; 12 | 13 | /// Log a message with the given metadata 14 | #[pyfunction] 15 | pub fn forward_to_tracing(message: &str, file: &str, lineno: i64, level: i32) { 16 | // Map level number to level name 17 | match level { 18 | 40 => tracing::error!(file = file, lineno = lineno, message), 19 | 30 => tracing::warn!(file = file, lineno = lineno, message), 20 | 20 => tracing::info!(file = file, lineno = lineno, message), 21 | 10 => tracing::debug!(file = file, lineno = lineno, message), 22 | _ => tracing::info!(file = file, lineno = lineno, message), 23 | } 24 | } 25 | 26 | use pyo3::Bound; 27 | use pyo3::types::PyModule; 28 | 29 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 30 | let f = wrap_pyfunction!(forward_to_tracing, module)?; 31 | f.setattr( 32 | "__module__", 33 | "monarch._rust_bindings.hyperactor_extension.telemetry", 34 | )?; 35 | module.add_function(f)?; 36 | Ok(()) 37 | } 38 | -------------------------------------------------------------------------------- /hyperactor_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_macros:hyperactor_macros 2 | 3 | [package] 4 | name = "hyperactor_macros" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "macros to support the Hyperactor actors and data exchange" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [lib] 13 | test = false 14 | doctest = false 15 | proc-macro = true 16 | 17 | [dependencies] 18 | convert_case = "0.6" 19 | indoc = "2.0.2" 20 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 21 | quote = "1.0.29" 22 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 23 | 24 | [dev-dependencies] 25 | hyperactor = { path = "../hyperactor" } 26 | serde = { version = "1.0.185", features = ["derive", "rc"] } 27 | tokio = { version = "1.37.0", features = ["full", "test-util", "tracing"] } 28 | -------------------------------------------------------------------------------- /hyperactor_macros/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | fn main() { 10 | println!("cargo::rustc-check-cfg=cfg(enable_hyperactor_message_logging)"); 11 | } 12 | -------------------------------------------------------------------------------- /hyperactor_mesh/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_mesh:[hyperactor_mesh,hyperactor_mesh_example_dining_philosophers,hyperactor_mesh_test_bootstrap] 2 | 3 | [package] 4 | name = "hyperactor_mesh" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "hyperactor_mesh_example_dining_philosophers" 12 | path = "examples/dining_philosophers.rs" 13 | 14 | [[bin]] 15 | name = "hyperactor_mesh_test_bootstrap" 16 | path = "test/bootstrap.rs" 17 | 18 | [dependencies] 19 | anyhow = "1.0.95" 20 | async-trait = "0.1.86" 21 | bincode = "1.3.3" 22 | bitmaps = "3.2.1" 23 | enum-as-inner = "0.6.0" 24 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 25 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 26 | hyperactor_mesh_macros = { version = "0.0.0", path = "../hyperactor_mesh_macros" } 27 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 28 | mockall = "0.13.1" 29 | ndslice = { version = "0.0.0", path = "../ndslice" } 30 | rand = { version = "0.8", features = ["small_rng"] } 31 | serde = { version = "1.0.185", features = ["derive", "rc"] } 32 | thiserror = "2.0.12" 33 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 34 | tokio-stream = { version = "0.1.17", features = ["fs", "io-util", "net", "signal", "sync", "time"] } 35 | tokio-util = { version = "0.7.15", features = ["full"] } 36 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 37 | 38 | [dev-dependencies] 39 | buck-resources = "1" 40 | maplit = "1.0" 41 | timed_test = { version = "0.0.0", path = "../timed_test" } 42 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 43 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/mesh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | use ndslice::Range; 11 | use ndslice::Shape; 12 | use ndslice::ShapeError; 13 | use ndslice::SliceIterator; 14 | 15 | /// A mesh of nodes, organized into the topology described by its shape (see [`Shape`]). 16 | #[async_trait] 17 | pub trait Mesh { 18 | /// The type of the node contained in the mesh. 19 | type Node; 20 | 21 | /// The type of a slice of this mesh. Slices should not outlive their 22 | /// parent mesh. 23 | type Sliced<'a>: Mesh + 'a 24 | where 25 | Self: 'a; 26 | 27 | /// The shape of this mesh. 28 | fn shape(&self) -> &Shape; 29 | 30 | /// Sub-slice this mesh, specifying the included ranges for 31 | /// the dimension with the labeled name. 32 | fn select>(&self, label: &str, range: R) 33 | -> Result, ShapeError>; 34 | 35 | /// Retrieve contained node at the provided index. The index is 36 | /// relative to the shape of the mesh. 37 | fn get(&self, index: usize) -> Option; 38 | 39 | /// Iterate over all the nodes in this mesh. 
40 | fn iter(&self) -> MeshIter<'_, Self> { 41 | MeshIter { 42 | mesh: self, 43 | slice_iter: self.shape().slice().iter(), 44 | } 45 | } 46 | } 47 | 48 | /// An iterator over the nodes of a mesh. 49 | pub struct MeshIter<'a, M: Mesh + ?Sized> { 50 | mesh: &'a M, 51 | slice_iter: SliceIterator<'a>, 52 | } 53 | 54 | impl Iterator for MeshIter<'_, M> { 55 | type Item = M::Node; 56 | 57 | fn next(&mut self) -> Option { 58 | self.slice_iter 59 | .next() 60 | .map(|index| self.mesh.get(index).unwrap()) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor_telemetry::*; 10 | 11 | declare_static_timer!( 12 | ACTOR_MESH_CAST_DURATION, 13 | "actor_mesh_cast_duration", 14 | TimeUnit::Micros 15 | ); 16 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/test_utils.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | use hyperactor::Actor; 11 | use hyperactor::Handler; 12 | use hyperactor::Instance; 13 | use hyperactor::Named; 14 | use hyperactor::message::Bind; 15 | use hyperactor::message::Bindings; 16 | use hyperactor::message::IndexedErasedUnbound; 17 | use hyperactor::message::Unbind; 18 | use serde::Deserialize; 19 | use serde::Serialize; 20 | 21 | use crate::actor_mesh::Cast; 22 | 23 | /// Message that can be sent to an EmptyActor. 24 | #[derive(Serialize, Deserialize, Debug, Named, Clone)] 25 | pub struct EmptyMessage(); 26 | 27 | // TODO(pzhang) replace the boilerplate Bind/Unbind impls with a macro. 28 | impl Bind for EmptyMessage { 29 | fn bind(self, _bindings: &Bindings) -> anyhow::Result { 30 | Ok(self) 31 | } 32 | } 33 | 34 | impl Unbind for EmptyMessage { 35 | fn bindings(&self) -> anyhow::Result { 36 | Ok(Bindings::default()) 37 | } 38 | } 39 | 40 | /// No-op actor. 41 | #[derive(Debug, PartialEq)] 42 | #[hyperactor::export( 43 | EmptyMessage, 44 | Cast, IndexedErasedUnbound> 45 | )] 46 | pub struct EmptyActor(); 47 | 48 | #[async_trait] 49 | impl Actor for EmptyActor { 50 | type Params = (); 51 | 52 | async fn new(_: ()) -> Result { 53 | Ok(Self()) 54 | } 55 | } 56 | 57 | #[async_trait] 58 | impl Handler for EmptyActor { 59 | async fn handle(&mut self, _: &Instance, _: EmptyMessage) -> Result<(), anyhow::Error> { 60 | Ok(()) 61 | } 62 | } 63 | 64 | #[async_trait] 65 | impl Handler> for EmptyActor { 66 | async fn handle( 67 | &mut self, 68 | _: &Instance, 69 | _: Cast, 70 | ) -> Result<(), anyhow::Error> { 71 | Ok(()) 72 | } 73 | } 74 | hyperactor::remote!(EmptyActor); 75 | -------------------------------------------------------------------------------- /hyperactor_mesh/test/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// This is an "empty shell" bootstrap process, 10 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 11 | #[tokio::main] 12 | async fn main() { 13 | hyperactor_mesh::bootstrap_or_die().await; 14 | } 15 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_mesh_macros:hyperactor_mesh_macros 2 | 3 | [package] 4 | name = "hyperactor_mesh_macros" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [dependencies] 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Clippy can't see through quote! to use of proc-macro2 10 | #![allow(unused_crate_dependencies)] 11 | 12 | extern crate proc_macro; 13 | 14 | use proc_macro::TokenStream; 15 | use quote::quote; 16 | 17 | /// Parse a compact selection expression into a [`Selection`]. See 18 | /// [`selection::parse`] for syntax documentation. 19 | #[proc_macro] 20 | pub fn sel(input: TokenStream) -> TokenStream { 21 | match ndslice::selection::token_parser::parse_tokens(input.into()) { 22 | Ok(selection) => { 23 | let tokens = ndslice::selection::token_parser::selection_to_tokens(&selection); 24 | quote!(#tokens).into() 25 | } 26 | Err(e) => { 27 | let msg = format!("sel! 
parse failed: {}", e); 28 | quote!(compile_error!(#msg)).into() 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_multiprocess:hyperactor_multiprocess 2 | 3 | [package] 4 | name = "hyperactor_multiprocess" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 15 | enum-as-inner = "0.6.0" 16 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 17 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 18 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 19 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 20 | py-spy = { git = "https://github.com/technicianted/py-spy", rev = "8f74f3e4f955fee57f0d4a8103511ee788348a2a", features = ["unwind"] } 21 | remoteprocess = { git = "https://github.com/technicianted/remoteprocess", rev = "72505594a19d80c07df6f1dc4a80556b7e462148" } 22 | serde = { version = "1.0.185", features = ["derive", "rc"] } 23 | thiserror = "2.0.12" 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | tokio-retry = "0.3" 26 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 27 | 28 | [dev-dependencies] 29 | maplit = "1.0" 30 | rand = { version = "0.8", features = ["small_rng"] } 31 | regex = "1.11.1" 32 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 33 | timed_test = { version = "0.0.0", path = "../timed_test" } 34 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 35 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Multiprocess actor system and support. 10 | 11 | #![feature(assert_matches)] 12 | #![feature(never_type)] 13 | #![deny(missing_docs)] 14 | 15 | /// TODO: add missing doc. 16 | pub mod ping_pong; 17 | pub mod proc_actor; 18 | /// TODO: add missing doc. 19 | pub mod scheduler; 20 | /// TODO: add missing doc. 21 | pub mod supervision; 22 | /// TODO: add missing doc. 23 | pub mod system; 24 | pub mod system_actor; 25 | 26 | /// py-spy wrapper. 27 | pub mod pyspy; 28 | 29 | pub use hyperactor::actor; 30 | pub use system::System; 31 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/scheduler.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
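The `sel!` macro in hyperactor_mesh_macros/src/lib.rs above turns a token-level parse failure into a `compile_error!` so the problem surfaces as a normal compiler diagnostic. The following is a hedged, standalone sketch of that "parse or compile_error!" pattern with a trivial toy parser standing in for ndslice's selection token parser; it must live in a crate with `proc-macro = true`, as in the Cargo.toml above.

// Toy proc macro illustrating the compile_error!-on-parse-failure pattern.
extern crate proc_macro;

use proc_macro::TokenStream;
use quote::quote;

#[proc_macro]
pub fn shout(input: TokenStream) -> TokenStream {
    let text = input.to_string();
    if text.is_empty() {
        // Turn a macro-time failure into a readable compiler diagnostic.
        let msg = "shout! expects a non-empty expression";
        quote!(compile_error!(#msg)).into()
    } else {
        // Expand to a string literal containing the upper-cased input.
        let upper = text.to_uppercase();
        quote!(#upper).into()
    }
}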
7 | */ 8 | 9 | use async_trait::async_trait; 10 | 11 | /// TODO: add missing doc 12 | #[async_trait] 13 | pub trait Scheduler { 14 | /// TODO: add missing doc 15 | type GangHandle; 16 | /// TODO: add missing doc 17 | async fn schedule_gang(&self, size: u64) -> Result; 18 | } 19 | 20 | /// TODO: add missing doc 21 | pub struct UnimplementedScheduler; 22 | 23 | #[async_trait] 24 | impl Scheduler for UnimplementedScheduler { 25 | type GangHandle = !; 26 | 27 | async fn schedule_gang(&self, _size: u64) -> Result { 28 | unimplemented!() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /hyperactor_telemetry/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry:hyperactor_telemetry 2 | 3 | [package] 4 | name = "hyperactor_telemetry" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 13 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 14 | hdrhistogram = "7.5" 15 | lazy_static = { version = "1.5", features = ["spin_no_std"], default-features = false } 16 | opentelemetry = "0.29" 17 | opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] } 18 | rand = { version = "0.8", features = ["small_rng"] } 19 | scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 22 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 23 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 24 | tracing-appender = "0.2.3" 25 | tracing-core = { version = "0.1.33", features = ["valuable"] } 26 | tracing-glog = { version = "0.4.0", features = ["ansi", "tracing-log"] } 27 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 28 | 29 | [features] 30 | default = [] 31 | fbcode_build = ["fbinit", "scuba"] 32 | 33 | [lints] 34 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 35 | -------------------------------------------------------------------------------- /hyperactor_telemetry/src/otel.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
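The `Scheduler` trait in scheduler.rs above combines `async_trait` with an associated `GangHandle` type, and `UnimplementedScheduler` uses the never type `!` (enabled by `#![feature(never_type)]` in lib.rs) as a handle that can never be produced. A minimal toy implementation of the same shape is sketched below; the concrete handle and error types here are assumptions, since the generic arguments were lost in this listing.

// Toy scheduler mirroring the async-trait + associated-handle shape above.
use async_trait::async_trait;

#[async_trait]
pub trait ToyScheduler {
    type GangHandle;
    async fn schedule_gang(&self, size: u64) -> anyhow::Result<Self::GangHandle>;
}

pub struct LocalScheduler;

#[async_trait]
impl ToyScheduler for LocalScheduler {
    type GangHandle = Vec<u64>;

    async fn schedule_gang(&self, size: u64) -> anyhow::Result<Self::GangHandle> {
        // Pretend each member of the gang is identified by its rank.
        Ok((0..size).collect())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let handle = LocalScheduler.schedule_gang(4).await?;
    println!("scheduled gang: {handle:?}");
    Ok(())
}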
7 | */ 8 | 9 | pub fn tracing_layer< 10 | S: tracing::Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>, 11 | >() -> Option> { 12 | #[cfg(fbcode_build)] 13 | { 14 | Some(crate::meta::tracing_layer()) 15 | } 16 | #[cfg(not(fbcode_build))] 17 | { 18 | None:: + Send + Sync>> 19 | } 20 | } 21 | 22 | pub fn init_metrics() { 23 | #[cfg(fbcode_build)] 24 | { 25 | opentelemetry::global::set_meter_provider(crate::meta::meter_provider()); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /hyperactor_telemetry/src/pool.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::sync::mpmc::Receiver; 10 | use std::sync::mpmc::Sender; 11 | use std::sync::mpmc::sync_channel; 12 | use std::sync::mpsc::TryRecvError; 13 | 14 | /// A basic thread-safe pool of objects with a fixed capacity. 15 | /// The implementation uses a simple mpmc channel to store the 16 | /// objects. 17 | #[derive(Debug)] 18 | pub(crate) struct Pool { 19 | sender: Sender, 20 | receiver: Receiver, 21 | } 22 | 23 | impl Pool { 24 | pub(crate) fn new(cap: usize) -> Self { 25 | let (sender, receiver) = sync_channel(cap); 26 | Self { sender, receiver } 27 | } 28 | 29 | pub(crate) fn get(&self) -> T { 30 | match self.receiver.try_recv() { 31 | Ok(val) => val, 32 | Err(TryRecvError::Empty) => Default::default(), 33 | Err(TryRecvError::Disconnected) => panic!("channel disconnected"), 34 | } 35 | } 36 | 37 | #[allow(dead_code)] 38 | pub(crate) fn put(&self, value: T) { 39 | let _ = self.sender.try_send(value); 40 | } 41 | } 42 | 43 | // Manual implementation to avoid demanding T: Clone 44 | impl Clone for Pool { 45 | fn clone(&self) -> Self { 46 | Self { 47 | sender: self.sender.clone(), 48 | receiver: self.receiver.clone(), 49 | } 50 | } 51 | } 52 | 53 | #[cfg(test)] 54 | mod tests { 55 | use super::*; 56 | 57 | #[test] 58 | fn test_basic() { 59 | let pool: Pool = Pool::new(2); 60 | 61 | assert_eq!(pool.get(), 0); 62 | pool.put(1); 63 | assert_eq!(pool.get(), 1); 64 | pool.put(2); 65 | assert_eq!(pool.get(), 2); 66 | assert_eq!(pool.get(), 0); 67 | pool.put(3); 68 | pool.put(4); 69 | pool.put(5); 70 | assert_eq!(pool.get(), 3); 71 | assert_eq!(pool.get(), 4); 72 | assert_eq!(pool.get(), 0); 73 | 74 | pool.put(3); 75 | assert_eq!(pool.clone().get(), 3); 76 | pool.clone().put(123); 77 | assert_eq!(pool.get(), 123); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /hyperactor_telemetry/stubs/fbinit/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Stub implementation of fbinit for OSS builds 10 | //! 11 | //! This is a minimal implementation that provides the necessary API surface 12 | //! for code that depends on fbinit, but doesn't actually do anything. 
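The `Pool` in pool.rs above hands out a pooled value when one is available, falls back to `Default::default()` when the channel is empty, and drops returned values once the channel is full. It relies on the unstable `std::sync::mpmc` API; the sketch below reproduces the same get-or-default / best-effort-put behavior on stable Rust using the crossbeam-channel crate, which is an assumption for illustration and not a dependency of this crate.

// Stable-Rust sketch of the pool-with-default-fallback pattern.
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};

#[derive(Debug)]
struct Pool<T: Default> {
    sender: Sender<T>,
    receiver: Receiver<T>,
}

impl<T: Default> Pool<T> {
    fn new(cap: usize) -> Self {
        let (sender, receiver) = bounded(cap);
        Self { sender, receiver }
    }

    // Hand out a pooled value, or a fresh default if the pool is empty.
    fn get(&self) -> T {
        match self.receiver.try_recv() {
            Ok(val) => val,
            Err(TryRecvError::Empty) => T::default(),
            Err(TryRecvError::Disconnected) => panic!("channel disconnected"),
        }
    }

    // Return a value; silently drop it if the pool is already full.
    fn put(&self, value: T) {
        let _ = self.sender.try_send(value);
    }
}

fn main() {
    let pool: Pool<u64> = Pool::new(2);
    pool.put(7);
    assert_eq!(pool.get(), 7);
    assert_eq!(pool.get(), 0); // empty pool falls back to Default
}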
13 | 14 | /// A stub for the fbinit context 15 | #[derive(Clone, Copy, Debug)] 16 | pub struct FacebookInit; 17 | 18 | /// A trait for types that require fbinit 19 | pub trait MainWithFbinit { 20 | fn init_and_run(self, _fb: FacebookInit) -> i32; 21 | } 22 | 23 | /// Initialize the Facebook runtime (stub implementation) 24 | pub fn initialize_with_client_logging(_args: &[&str]) -> FacebookInit { 25 | FacebookInit 26 | } 27 | 28 | /// Initialize the Facebook runtime (stub implementation) 29 | pub fn initialize() -> FacebookInit { 30 | FacebookInit 31 | } 32 | 33 | /// Run a function with fbinit (stub implementation) 34 | pub fn run_with_init(f: F) -> R 35 | where 36 | F: FnOnce(FacebookInit) -> R, 37 | { 38 | f(FacebookInit) 39 | } 40 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry/tester:tester 2 | 3 | [package] 4 | name = "tester" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "tester" 12 | path = "main.rs" 13 | 14 | [dependencies] 15 | hyperactor_telemetry = { version = "0.0.0", path = ".." } 16 | opentelemetry = "0.29" 17 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 18 | 19 | [lints] 20 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 21 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
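The stub fbinit crate above lets code written against the Meta-internal API compile in OSS builds while the `FacebookInit` token carries no state. A small usage sketch, assuming this stub is wired in as the `fbinit` path dependency:

// Usage sketch against the stub API above.
fn main() {
    let exit_code = fbinit::run_with_init(|_fb: fbinit::FacebookInit| {
        // Internal builds would use the token to initialize logging/telemetry;
        // with the stub this closure simply runs and returns an exit code.
        0i32
    });
    std::process::exit(exit_code);
}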
7 | */ 8 | 9 | use hyperactor_telemetry::declare_static_counter; 10 | use hyperactor_telemetry::declare_static_gauge; 11 | use hyperactor_telemetry::declare_static_histogram; 12 | use hyperactor_telemetry::initialize_logging; 13 | 14 | // Declare static metrics for testing 15 | declare_static_counter!(REQUEST_COUNT, "test_requests"); 16 | declare_static_gauge!(MEMORY_USAGE, "test_memory_usage"); 17 | declare_static_histogram!(REQUEST_DURATION, "test_request_duration"); 18 | 19 | #[tracing::instrument] 20 | fn something_an_actor_would_do() { 21 | tracing::debug!("debug message"); 22 | } 23 | 24 | fn main() { 25 | // Initialize logging with default configuration 26 | initialize_logging(); 27 | tracing::info!("info log"); 28 | } 29 | -------------------------------------------------------------------------------- /monarch_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_extension:monarch_extension-lib 2 | 3 | [package] 4 | name = "monarch_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | name = "_rust_bindings" 12 | test = false 13 | doctest = false 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | anyhow = "1.0.95" 18 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 19 | controller = { version = "0.0.0", path = "../controller" } 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" } 24 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 25 | monarch_simulator_lib = { version = "0.0.0", path = "../monarch_simulator" } 26 | monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker" } 27 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 28 | nccl-sys = { path = "../nccl-sys" } 29 | ndslice = { version = "0.0.0", path = "../ndslice" } 30 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 33 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 34 | -------------------------------------------------------------------------------- /monarch_extension/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 
13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | 15 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 16 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /monarch_extension/src/panic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use pyo3::prelude::*; 10 | 11 | /// A function that panics when called. 12 | /// This is used for testing panic handling in the Python bindings. 13 | #[pyfunction] 14 | pub fn panicking_function() { 15 | panic!("This is a deliberate panic from panicking_function"); 16 | } 17 | 18 | /// Register Python bindings for the panic module. 19 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 20 | let f = wrap_pyfunction!(panicking_function, module)?; 21 | f.setattr( 22 | "__module__", 23 | "monarch._rust_bindings.monarch_extension.panic", 24 | )?; 25 | module.add_function(f)?; 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /monarch_hyperactor/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_hyperactor:[monarch_hyperactor,process_allocator-oss] 2 | 3 | [package] 4 | name = "monarch_hyperactor" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 15 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 16 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 17 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 18 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 19 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 20 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 21 | ndslice = { version = "0.0.0", path = "../ndslice" } 22 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 23 | pyo3-async-runtimes = { git = "https://github.com/PyO3/pyo3-async-runtimes", rev = "f6bb9b471a5b7765dd770af36e83f26802459621", features = ["attributes", "tokio-runtime"] } 24 | serde = { version = "1.0.185", features = ["derive", "rc"] } 25 | serde_bytes = "0.11" 26 | thiserror = "2.0.12" 27 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 28 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 29 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/actor_mesh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use std::sync::Arc; 10 | 11 | use hyperactor::ActorRef; 12 | use hyperactor_mesh::Mesh; 13 | use hyperactor_mesh::RootActorMesh; 14 | use hyperactor_mesh::actor_mesh::ActorMesh; 15 | use pyo3::exceptions::PyException; 16 | use pyo3::prelude::*; 17 | 18 | use crate::actor::PythonActor; 19 | use crate::actor::PythonMessage; 20 | use crate::mailbox::PyMailbox; 21 | use crate::proc::PyActorId; 22 | use crate::shape::PyShape; 23 | 24 | #[pyclass( 25 | name = "PythonActorMesh", 26 | module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh" 27 | )] 28 | pub struct PythonActorMesh { 29 | pub(super) inner: Arc>, 30 | pub(super) client: PyMailbox, 31 | } 32 | 33 | #[pymethods] 34 | impl PythonActorMesh { 35 | fn cast(&self, message: &PythonMessage) -> PyResult<()> { 36 | use ndslice::selection::dsl::*; 37 | self.inner 38 | .cast(all(true_()), message.clone()) 39 | .map_err(|err| PyException::new_err(err.to_string()))?; 40 | Ok(()) 41 | } 42 | 43 | // Consider defining a "PythonActorRef", which carries specifically 44 | // a reference to python message actors. 45 | fn get(&self, rank: usize) -> Option { 46 | self.inner 47 | .get(rank) 48 | .map(ActorRef::into_actor_id) 49 | .map(PyActorId::from) 50 | } 51 | 52 | #[getter] 53 | fn client(&self) -> PyMailbox { 54 | self.client.clone() 55 | } 56 | 57 | #[getter] 58 | fn shape(&self) -> PyShape { 59 | PyShape::from(self.inner.shape().clone()) 60 | } 61 | } 62 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { 63 | hyperactor_mod.add_class::()?; 64 | Ok(()) 65 | } 66 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bin/process_allocator/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | mod common; 10 | 11 | use clap::Parser; 12 | use common::Args; 13 | use common::main_impl; 14 | use hyperactor::channel::ChannelAddr; 15 | 16 | #[tokio::main] 17 | async fn main() { 18 | let args = Args::parse(); 19 | hyperactor::initialize(); 20 | 21 | let bind = format!("{}:{}", args.addr, args.port); 22 | let socket_addr: std::net::SocketAddr = bind.parse().unwrap(); 23 | let serve_address = ChannelAddr::Tcp(socket_addr); 24 | 25 | let _ = main_impl(serve_address, args.program).await.unwrap(); 26 | } 27 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
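The `cast` method in actor_mesh.rs above addresses every rank in the mesh by building a selection with ndslice's selection DSL. A minimal sketch of constructing that broadcast selection on its own is shown below; it only uses the `all` and `true_` combinators visible in the code above and assumes, as that usage suggests, that they produce a `Selection` value.

// Building the "every rank" selection used by PythonActorMesh::cast.
use ndslice::selection::dsl::{all, true_};
use ndslice::Selection;

fn select_everything() -> Selection {
    // all(true_()) applies no coordinate filtering: the broadcast case.
    all(true_())
}

fn main() {
    println!("{:?}", select_everything());
}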
7 | */ 8 | 9 | use hyperactor_mesh::bootstrap_or_die; 10 | use pyo3::Bound; 11 | use pyo3::PyAny; 12 | use pyo3::PyResult; 13 | use pyo3::Python; 14 | use pyo3::pyfunction; 15 | use pyo3::types::PyAnyMethods; 16 | use pyo3::types::PyModule; 17 | use pyo3::types::PyModuleMethods; 18 | use pyo3::wrap_pyfunction; 19 | 20 | #[pyfunction] 21 | #[pyo3(signature = ())] 22 | pub fn bootstrap_main(py: Python) -> PyResult> { 23 | hyperactor::tracing::debug!("entering async bootstrap"); 24 | pyo3_async_runtimes::tokio::future_into_py::<_, ()>(py, async move { 25 | bootstrap_or_die().await; 26 | }) 27 | } 28 | 29 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { 30 | let f = wrap_pyfunction!(bootstrap_main, hyperactor_mod)?; 31 | f.setattr( 32 | "__module__", 33 | "monarch._rust_bindings.monarch_hyperactor.bootstrap", 34 | )?; 35 | hyperactor_mod.add_function(f)?; 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod actor; 12 | pub mod actor_mesh; 13 | pub mod alloc; 14 | pub mod bootstrap; 15 | pub mod mailbox; 16 | pub mod ndslice; 17 | pub mod proc; 18 | pub mod proc_mesh; 19 | pub mod runtime; 20 | pub mod selection; 21 | pub mod shape; 22 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/selection.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use ndslice::selection::Selection; 10 | use pyo3::PyResult; 11 | use pyo3::prelude::*; 12 | use pyo3::types::PyType; 13 | 14 | #[pyclass( 15 | name = "Selection", 16 | module = "monarch._rust_bindings.monarch_hyperactor.selection", 17 | frozen 18 | )] 19 | pub struct PySelection { 20 | inner: Selection, 21 | } 22 | 23 | impl From for PySelection { 24 | fn from(inner: Selection) -> Self { 25 | Self { inner } 26 | } 27 | } 28 | 29 | #[pymethods] 30 | impl PySelection { 31 | #[getter] 32 | fn __repr__(&self) -> String { 33 | format!("{:?}", self.inner) 34 | } 35 | 36 | #[classmethod] 37 | #[pyo3(name = "from_string")] 38 | pub fn parse(_cls: Bound<'_, PyType>, input: &str) -> PyResult { 39 | let selection = ndslice::selection::parse::parse(input).map_err(|err| { 40 | pyo3::exceptions::PyValueError::new_err(format!("parse error: {err}")) 41 | })?; 42 | 43 | Ok(PySelection::from(selection)) 44 | } 45 | } 46 | 47 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 48 | module.add_class::()?; 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /monarch_messages/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_messages:monarch_messages 2 | 3 | [package] 4 | name = "monarch_messages" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | derive_more = { version = "1.0.0", features = ["full"] } 13 | enum-as-inner = "0.6.0" 14 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 15 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 18 | serde = { version = "1.0.185", features = ["derive", "rc"] } 19 | serde_bytes = "0.11" 20 | thiserror = "2.0.12" 21 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 22 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 23 | 24 | [dev-dependencies] 25 | paste = "1.0.14" 26 | -------------------------------------------------------------------------------- /monarch_messages/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_messages/src/debugger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // NOTE: Until https://github.com/PyO3/pyo3/pull/4674, `pyo3::pymethods` trigger 10 | // and unsafe-op-in-unsafe-fn warnings. 
11 | #![allow(unsafe_op_in_unsafe_fn)] 12 | 13 | use derive_more::From; 14 | use hyperactor::Named; 15 | use pyo3::Bound; 16 | use pyo3::PyResult; 17 | use pyo3::types::PyModule; 18 | use pyo3::types::PyModuleMethods; 19 | use serde::Deserialize; 20 | use serde::Serialize; 21 | 22 | pub fn register_python_bindings(debugger: &Bound<'_, PyModule>) -> PyResult<()> { 23 | debugger.add_class::()?; 24 | Ok(()) 25 | } 26 | 27 | /// Enumerates the actions relevant to PDB debugging sessions. 28 | #[derive(Debug, Deserialize, Clone, Serialize, PartialEq)] 29 | #[pyo3::pyclass(frozen, module = "monarch._rust_bindings.monarch_messages.debugger")] 30 | pub enum DebuggerAction { 31 | /// Sent from worker to client to indicate that the worker has entered 32 | /// a pdb debugging session. 33 | Paused(), 34 | 35 | /// Sent from client to worker to indicate that the client has started 36 | /// the debugging session. 37 | Attach(), 38 | 39 | /// Sent to client or to worker to end the debugging session. 40 | Detach(), 41 | 42 | /// Sent to client or to worker to write bytes to receiver's stdout. 43 | Write { 44 | #[serde(with = "serde_bytes")] 45 | bytes: Vec, 46 | }, 47 | 48 | /// Sent from worker to client to read bytes from client's stdin. 49 | Read { requested_size: usize }, 50 | } 51 | 52 | #[derive(Serialize, Deserialize, Debug, Clone, Named, From)] 53 | pub enum DebuggerMessage { 54 | Action { action: DebuggerAction }, 55 | } 56 | -------------------------------------------------------------------------------- /monarch_messages/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | pub mod client; 12 | pub mod controller; 13 | pub mod debugger; 14 | pub mod wire_value; 15 | pub mod worker; 16 | -------------------------------------------------------------------------------- /monarch_messages/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
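`DebuggerAction` in debugger.rs above derives Serialize/Deserialize and uses `serde_bytes` for the raw stdout/stdin payloads, so the actions round-trip cleanly over a binary wire format. The sketch below mirrors a couple of those variants in a standalone toy enum to show the round trip; it is not the crate's own type, and bincode is used here only because it appears as a dependency elsewhere in this tree.

// Standalone round-trip sketch for a DebuggerAction-style enum.
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
enum ToyDebuggerAction {
    Attach(),
    Write {
        #[serde(with = "serde_bytes")]
        bytes: Vec<u8>,
    },
    Read {
        requested_size: usize,
    },
}

fn main() -> anyhow::Result<()> {
    let action = ToyDebuggerAction::Write { bytes: b"(Pdb) ".to_vec() };
    let wire = bincode::serialize(&action)?;
    let back: ToyDebuggerAction = bincode::deserialize(&wire)?;
    assert_eq!(back, action);
    Ok(())
}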
6 | 7 | import math 8 | 9 | 10 | def has_nan(t): 11 | return math.isnan(t) 12 | -------------------------------------------------------------------------------- /monarch_meta_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_meta_extension:monarch_meta_extension-lib 2 | 3 | [package] 4 | name = "monarch_meta_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | name = "_lib_meta" 12 | test = false 13 | doctest = false 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | anyhow = "1.0.95" 18 | async-trait = "0.1.86" 19 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" } 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 22 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 23 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 24 | pyo3-async-runtimes = { git = "https://github.com/PyO3/pyo3-async-runtimes", rev = "f6bb9b471a5b7765dd770af36e83f26802459621", features = ["attributes", "tokio-runtime"] } 25 | serde_derive = "1.0.185" 26 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 27 | -------------------------------------------------------------------------------- /monarch_meta_extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod alloc; 12 | pub mod alloc_mock; 13 | 14 | use pyo3::prelude::*; 15 | 16 | #[pymodule] 17 | #[pyo3(name = "_lib_meta")] 18 | pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> { 19 | //Safety: This needs to be called here because we can't use fbinit::main 20 | unsafe { 21 | fbinit::perform_init(); 22 | } 23 | 24 | ::hyperactor::initialize(); 25 | 26 | let hyperactor_mod = PyModule::new_bound(module.py(), "hyperactor_meta")?; 27 | alloc::register_python_bindings(&hyperactor_mod)?; 28 | alloc_mock::register_python_bindings(&hyperactor_mod)?; 29 | 30 | module.add_submodule(&hyperactor_mod)?; 31 | Ok(()) 32 | } 33 | -------------------------------------------------------------------------------- /monarch_rdma/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma:monarch_rdma 2 | 3 | [package] 4 | name = "monarch_rdma" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 14 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 15 | ibverbs = "0.7.1" 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | rand = { version = "0.8", features = ["small_rng"] } 18 | serde = { version = "1.0.185", features = ["derive", "rc"] } 19 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 20 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 21 | 22 | [dev-dependencies] 23 | timed_test = { version = "0.0.0", path = "../timed_test" } 24 | -------------------------------------------------------------------------------- /monarch_rdma/examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma/examples:[parameter_server,parameter_server_bootstrap,parameter_server_example] 2 | 3 | [package] 4 | name = "parameter_server" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | path = "parameter_server.rs" 12 | 13 | [[bin]] 14 | name = "parameter_server_bootstrap" 15 | path = "bootstrap.rs" 16 | 17 | [[bin]] 18 | name = "parameter_server_example" 19 | path = "main.rs" 20 | 21 | [dependencies] 22 | anyhow = "1.0.95" 23 | async-trait = "0.1.86" 24 | buck-resources = "1" 25 | hyperactor = { version = "0.0.0", path = "../../hyperactor" } 26 | hyperactor_mesh = { version = "0.0.0", path = "../../hyperactor_mesh" } 27 | monarch_rdma = { version = "0.0.0", path = ".." } 28 | ndslice = { version = "0.0.0", path = "../../ndslice" } 29 | serde = { version = "1.0.185", features = ["derive", "rc"] } 30 | timed_test = { version = "0.0.0", path = "../../timed_test" } 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 33 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 34 | -------------------------------------------------------------------------------- /monarch_rdma/examples/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unused)] 10 | use std::hint::black_box; 11 | 12 | use monarch_rdma::RdmaManagerActor; 13 | use parameter_server::ParameterServerActor; 14 | use parameter_server::WorkerActor; 15 | 16 | /// This is an "empty shell" bootstrap process, 17 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 18 | #[tokio::main] 19 | async fn main() { 20 | tracing_subscriber::fmt() 21 | .with_max_level(tracing::Level::INFO) 22 | .init(); 23 | // The following black_box lines force-link the actors needed for the parameter server 24 | // example to run. Relying on side-effects for actor registration is not consistent across 25 | // all build modes. 26 | let _ = black_box::>(None); 27 | let _ = black_box::>(None); 28 | let _ = black_box::>(None); 29 | hyperactor_mesh::bootstrap_or_die().await; 30 | } 31 | -------------------------------------------------------------------------------- /monarch_rdma/examples/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Main running script for parameter server example. 10 | //! 11 | //! This script needs to be kept separate to avoid buck naming collisions. 12 | //! 13 | //! Specifically, parameter_server::run uses ProcAllocator, which spawns 14 | //! the binary defined in //monarch/examples/rdma/bootstrap.rs. 15 | //! 16 | //! If this main script was kept in the same file as parameter_server.rs, then 17 | //! spawning the actors defined in parameter_server would be named e.g. 18 | //! "parameter_server_example::ParameterServerActor", whereas the bootstrap binary 19 | //! expects this to be named "parameter_server::ParameterServerActor". 20 | //! 21 | //! Keeping this file separate allows us to avoid this naming collision. 22 | use parameter_server::run; 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), anyhow::Error> { 26 | run(4, 5).await 27 | } 28 | -------------------------------------------------------------------------------- /monarch_rdma/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
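The `black_box` calls in the bootstrap above exist only to force the linker to keep actor types whose registration happens through side effects, which is not reliable across all build modes. A minimal sketch of that force-link pattern follows; `MyActor` is a placeholder, and the turbofish type argument is an assumption since the generic arguments were lost in this listing.

// Force-link sketch: reference a type through black_box without constructing it.
use std::hint::black_box;

struct MyActor;

fn keep_linked() {
    // Defeats dead-code elimination for MyActor while creating no instance.
    let _ = black_box::<Option<MyActor>>(None);
}

fn main() {
    keep_linked();
}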
7 | */ 8 | 9 | mod ibverbs_primitives; 10 | mod rdma_buffer; 11 | mod rdma_components; 12 | mod rdma_manager_actor; 13 | 14 | pub use ibverbs_primitives::*; 15 | pub use rdma_buffer::*; 16 | pub use rdma_components::*; 17 | pub use rdma_manager_actor::*; 18 | -------------------------------------------------------------------------------- /monarch_simulator/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_simulator:[monarch_simulator,monarch_simulator_lib] 2 | 3 | [package] 4 | name = "monarch_simulator_lib" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "monarch_simulator" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | anyhow = "1.0.95" 16 | async-trait = "0.1.86" 17 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 18 | controller = { version = "0.0.0", path = "../controller" } 19 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 20 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 21 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | lazy_static = { version = "1.5", features = ["spin_no_std"], default-features = false } 24 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 25 | monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker" } 26 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 27 | ndslice = { version = "0.0.0", path = "../ndslice" } 28 | serde = { version = "1.0.185", features = ["derive", "rc"] } 29 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 30 | thiserror = "2.0.12" 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 33 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 34 | 35 | [dev-dependencies] 36 | rand = { version = "0.8", features = ["small_rng"] } 37 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 38 | -------------------------------------------------------------------------------- /monarch_simulator/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor::actor::ActorError; 10 | use hyperactor::simnet::SimNetError; 11 | 12 | pub mod bootstrap; 13 | mod collective_coordinator; 14 | pub mod controller; 15 | pub mod simulator; 16 | pub mod worker; 17 | 18 | /// The type of error that can occur on channel operations. 19 | #[derive(thiserror::Error, Debug)] 20 | pub enum SimulatorError { 21 | /// Error during simnet operation. 22 | #[error(transparent)] 23 | SimNetError(#[from] SimNetError), 24 | 25 | /// Error during actor operations. 26 | #[error(transparent)] 27 | ActorError(#[from] ActorError), 28 | 29 | /// Simulator cannot find the world with given name. 30 | #[error("World {0} not found")] 31 | WorldNotFound(String), 32 | 33 | /// Cannot find the mesh in simulator. 
34 | #[error("Mesh not found {0}")] 35 | MeshNotFound(String), 36 | } 37 | -------------------------------------------------------------------------------- /monarch_simulator/src/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A binary to launch the simulated Monarch controller along with necessary environment. 10 | use std::process::ExitCode; 11 | 12 | use anyhow::Result; 13 | use clap::Parser; 14 | use hyperactor::channel::ChannelAddr; 15 | use monarch_simulator_lib::bootstrap::bootstrap; 16 | 17 | #[derive(Debug, Parser)] 18 | struct Args { 19 | #[arg(short, long)] 20 | system_addr: ChannelAddr, 21 | #[arg(short, long)] 22 | proxy_addr: ChannelAddr, 23 | } 24 | 25 | const TITLE: &str = r#" 26 | ****************************************************** 27 | * * 28 | * ____ ___ __ __ _ _ _ _ _____ ___ ____ * 29 | */ ___|_ _| \/ | | | | | / \|_ _/ _ \| _ \ * 30 | *\___ \| || |\/| | | | | | / _ \ | || | | | |_) |* 31 | * ___) | || | | | |_| | |___ / ___ \| || |_| | _ < * 32 | *|____/___|_| |_|\___/|_____/_/ \_\_| \___/|_| \_\* 33 | * * 34 | ****************************************************** 35 | "#; 36 | 37 | #[tokio::main] 38 | async fn main() -> Result { 39 | eprintln!("{}", TITLE); 40 | hyperactor::initialize(); 41 | let args = Args::parse(); 42 | 43 | let system_addr = args.system_addr.clone(); 44 | let proxy_addr = args.proxy_addr.clone(); 45 | tracing::info!("starting Monarch simulation"); 46 | 47 | let operational_listener_handle = bootstrap(system_addr, proxy_addr, 1).await?; 48 | 49 | operational_listener_handle 50 | .await 51 | .expect("simulator exited unexpectedly"); 52 | 53 | Ok(ExitCode::SUCCESS) 54 | } 55 | -------------------------------------------------------------------------------- /monarch_tensor_worker/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_tensor_worker:monarch_tensor_worker 2 | 3 | [package] 4 | name = "monarch_tensor_worker" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 15 | cxx = "1.0.119" 16 | derive_more = { version = "1.0.0", features = ["full"] } 17 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 18 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 19 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 20 | itertools = "0.14.0" 21 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 22 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 23 | ndslice = { version = "0.0.0", path = "../ndslice" } 24 | nix = { version = "0.29.0", features = ["dir", "event", "hostname", "inotify", "ioctl", "mman", "mount", "net", "poll", "ptrace", "reboot", "resource", "sched", "signal", "term", "time", "user", "zerocopy"] } 25 | parking_lot = { version = "0.12.1", features = ["send_guard"] } 26 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 27 | serde = { version = "1.0.185", features = ["derive", "rc"] } 28 | serde_json = { version 
= "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 29 | sorted-vec = "0.8.3" 30 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 31 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 32 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 33 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 34 | 35 | [dev-dependencies] 36 | indoc = "2.0.2" 37 | rand = { version = "0.8", features = ["small_rng"] } 38 | timed_test = { version = "0.0.0", path = "../timed_test" } 39 | tokio-retry = "0.3" 40 | -------------------------------------------------------------------------------- /monarch_tensor_worker/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_tensor_worker/src/test_util.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::io::IsTerminal; 10 | 11 | use anyhow::Result; 12 | use pyo3::Python; 13 | use tracing_subscriber::fmt::format::FmtSpan; 14 | 15 | pub fn test_setup() -> Result<()> { 16 | let _ = tracing_subscriber::fmt() 17 | .with_thread_ids(true) 18 | .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) 19 | .with_max_level(tracing::Level::DEBUG) 20 | .with_ansi(std::io::stderr().is_terminal()) 21 | .with_writer(std::io::stderr) 22 | .try_init(); 23 | 24 | // Redirect NCCL_DEBUG log output to a file so it doesn't clash on stdout. 25 | // TestX requires stdout to have JSON output on individual lines, and 26 | // the NCCL output is not JSON. Because it runs in a different thread, it'll 27 | // race on writing to stdout. 28 | // Do this regardless of whether NCCL_DEBUG is set or not, because it can 29 | // be set after this point in the test. If it doesn't get set, NCCL_DEBUG_FILE 30 | // will be ignored. 31 | // %h becomes hostname, %p becomes pid. 32 | let nccl_debug_file = std::env::temp_dir().join("nccl_debug.%h.%p"); 33 | tracing::debug!("Set NCCL_DEBUG_FILE to {:?}", nccl_debug_file); 34 | // Safety: Can be unsound if there are multiple threads 35 | // reading and writing the environment. 36 | unsafe { 37 | std::env::set_var("NCCL_DEBUG_FILE", nccl_debug_file); 38 | } 39 | // NOTE(agallagher): Calling `prepare_freethreaded_python` appears to 40 | // clear `PYTHONPATH` in the env, which we need for test subprocesses 41 | // to work. So, manually preserve it. 
42 | let py_path = std::env::var("PYTHONPATH"); 43 | pyo3::prepare_freethreaded_python(); 44 | if let Ok(py_path) = py_path { 45 | // SAFETY: Re-setting env var cleard by `prepare_freethreaded_python`. 46 | unsafe { std::env::set_var("PYTHONPATH", py_path) } 47 | } 48 | 49 | // We need to load torch to initialize some internal structures used by 50 | // the FFI funcs we use to convert ivalues to/from py objects. 51 | Python::with_gil(|py| py.run_bound("import torch", None, None))?; 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /monarch_tensor_worker/test_worker_main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """ 8 | Simplified version of worker_main.py for testing the monarch_tensor_worker standalone. 9 | 10 | We want a Python entrypoint here because we want to initialize the Monarch 11 | Python extension on the main thread. 12 | """ 13 | 14 | 15 | def main() -> None: 16 | # torch is import to make sure all the dynamic types are registered 17 | import torch # noqa 18 | 19 | # Force CUDA initialization early on. CUDA init is lazy, and Python CUDA 20 | # APIs are guarded to init CUDA if necessary. But our worker calls 21 | # raw libtorch APIs which are not similarly guarded. So just initialize here 22 | # to avoid issues with potentially using uninitialized CUDA state. 23 | torch.cuda.init() 24 | 25 | from monarch._rust_bindings.monarch_extension import ( # @manual=//monarch/monarch_extension:monarch_extension 26 | tensor_worker, 27 | ) 28 | 29 | # pyre-ignore[16] 30 | tensor_worker.worker_main() 31 | 32 | 33 | if __name__ == "__main__": 34 | # Do not add code here, it won't be run. Add them to the function called below. 35 | main() # pragma: no cover 36 | -------------------------------------------------------------------------------- /monarch_types/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_types:monarch_types 2 | 3 | [package] 4 | name = "monarch_types" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | derive_more = { version = "1.0.0", features = ["full"] } 12 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 13 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 14 | serde = { version = "1.0.185", features = ["derive", "rc"] } 15 | serde_bytes = "0.11" 16 | 17 | [dev-dependencies] 18 | anyhow = "1.0.95" 19 | timed_test = { version = "0.0.0", path = "../timed_test" } 20 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 21 | -------------------------------------------------------------------------------- /monarch_types/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | mod pyobject; 12 | mod python; 13 | mod pytree; 14 | 15 | pub use pyobject::PickledPyObject; 16 | pub use python::SerializablePyErr; 17 | pub use python::TryIntoPyObject; 18 | pub use python::TryIntoPyObjectUnsafe; 19 | pub use pytree::PyTree; 20 | -------------------------------------------------------------------------------- /monarch_types/src/pyobject.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor::Named; 10 | use pyo3::prelude::*; 11 | use pyo3::types::PyBytes; 12 | use serde::Deserialize; 13 | use serde::Serialize; 14 | 15 | use crate::TryIntoPyObject; 16 | 17 | #[derive(Debug, Clone, Serialize, Deserialize, Named)] 18 | pub struct PickledPyObject(#[serde(with = "serde_bytes")] Vec); 19 | 20 | impl PickledPyObject { 21 | pub fn pickle<'py>(obj: &Bound<'py, PyAny>) -> PyResult { 22 | let bytes = obj 23 | .py() 24 | .import_bound("pickle")? 25 | .call_method1("dumps", (obj,))? 26 | .downcast_into::()? 27 | .as_bytes() 28 | .to_vec(); 29 | Ok(Self(bytes)) 30 | } 31 | 32 | pub fn unpickle<'py>(&self, py: Python<'py>) -> PyResult> { 33 | py.import_bound("pickle")? 34 | .call_method1("loads", (self.0.as_slice(),)) 35 | } 36 | } 37 | 38 | impl TryFrom<&Bound<'_, PyAny>> for PickledPyObject { 39 | type Error = PyErr; 40 | fn try_from(obj: &Bound<'_, PyAny>) -> PyResult { 41 | Self::pickle(obj) 42 | } 43 | } 44 | 45 | impl TryFrom> for PickledPyObject { 46 | type Error = PyErr; 47 | fn try_from(obj: Bound<'_, PyAny>) -> PyResult { 48 | Self::pickle(&obj) 49 | } 50 | } 51 | 52 | impl FromPyObject<'_> for PickledPyObject { 53 | fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { 54 | PickledPyObject::pickle(obj) 55 | } 56 | } 57 | 58 | impl TryIntoPyObject for &PickledPyObject { 59 | fn try_to_object<'a>(self, py: Python<'a>) -> PyResult> { 60 | self.unpickle(py) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /nccl-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nccl-sys" 3 | version = "0.0.0" 4 | authors = ["Facebook"] 5 | edition = "2021" 6 | license = "MIT" 7 | links = "nccl" 8 | 9 | [dependencies] 10 | cxx = "1.0.119" 11 | serde = { version = "1.0.185", features = ["derive", "rc"] } 12 | 13 | [build-dependencies] 14 | bindgen = "0.70.1" 15 | which = "6.0.3" 16 | glob = "0.3.1" 17 | -------------------------------------------------------------------------------- /nccl-sys/src/nccl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
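`PickledPyObject` in pyobject.rs above captures a Python object as pickle bytes so it can be shipped or stored from Rust and later restored. The usage sketch below assumes the `monarch_types` crate is available as a dependency and that the interpreter is embedded via `pyo3::prepare_freethreaded_python`; it is illustrative, not a test from this repository.

// Pickle a Python dict from Rust, then restore and inspect it.
use monarch_types::PickledPyObject;
use pyo3::prelude::*;
use pyo3::types::PyDict;

fn main() -> PyResult<()> {
    pyo3::prepare_freethreaded_python();
    Python::with_gil(|py| {
        let dict = PyDict::new_bound(py);
        dict.set_item("rank", 3)?;
        let pickled = PickledPyObject::pickle(dict.as_any())?;
        let restored = pickled.unpickle(py)?;
        assert!(restored.downcast::<PyDict>().is_ok());
        Ok(())
    })
}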
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | -------------------------------------------------------------------------------- /ndslice/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/ndslice:ndslice 2 | 3 | [package] 4 | name = "ndslice" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "data structures to support n-d arrays of ranks" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [dependencies] 13 | anyhow = "1.0.95" 14 | enum-as-inner = "0.6.0" 15 | itertools = "0.14.0" 16 | nom = "8" 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | rand = { version = "0.8", features = ["small_rng"] } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | thiserror = "2.0.12" 22 | 23 | [dev-dependencies] 24 | proptest = "1.5" 25 | -------------------------------------------------------------------------------- /ndslice/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Core mesh components for the hyperactor framework. 10 | //! 11 | //! Provides [`Slice`], a compact representation of a subset of a 12 | //! multidimensional array. See [`Slice`] for more details. 13 | //! 14 | //! This crate defines the foundational abstractions used in 15 | //! hyperactor's mesh layer, including multidimensional shapes and 16 | //! selection algebra. The crate avoids dependencies on procedural 17 | //! macros and other higher-level constructs, enabling reuse in both 18 | //! runtime and macro contexts. 19 | 20 | #![feature(assert_matches)] 21 | #![recursion_limit = "512"] 22 | 23 | mod slice; 24 | pub use slice::DimSliceIterator; 25 | pub use slice::Slice; 26 | pub use slice::SliceError; 27 | pub use slice::SliceIterator; 28 | 29 | /// Selection algebra for describing multidimensional mesh regions. 30 | pub mod selection; 31 | 32 | /// Core types for representing multidimensional shapes and strides. 33 | pub mod shape; 34 | 35 | /// Reshaping transformations for multidimensional slices and shapes. 36 | pub mod reshape; 37 | 38 | /// The selection expression type used to define routing constraints. 39 | pub use selection::Selection; 40 | /// DSL-style constructors for building `Selection` expressions. 41 | pub use selection::dsl; 42 | /// Represents an interval with an optional end and step, used to 43 | /// define extents in `Shape` and coordinate filters in `Selection`. 44 | pub use shape::Range; 45 | /// Describes the size and layout of a multidimensional mesh. 46 | pub use shape::Shape; 47 | /// Errors that can occur during shape construction or validation. 48 | pub use shape::ShapeError; 49 | 50 | /// Property-based generators for randomized test input. 
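The ndslice crate docs above describe `Slice` as a compact representation of a subset of a multidimensional array in terms of offset, sizes, and strides. The sketch below shows the row-major coordinate-to-flat-index arithmetic such a representation implies; it is a self-contained illustration of the idea, not the actual `Slice` API.

// Coordinate -> flat index arithmetic for an offset/strides description.
fn flat_index(offset: usize, strides: &[usize], coords: &[usize]) -> usize {
    offset
        + coords
            .iter()
            .zip(strides.iter())
            .map(|(c, s)| c * s)
            .sum::<usize>()
}

fn main() {
    // A 2 x 3 region laid out row-major inside a larger buffer starting at 10.
    let strides = [3, 1];
    assert_eq!(flat_index(10, &strides, &[0, 0]), 10);
    assert_eq!(flat_index(10, &strides, &[1, 2]), 15);
}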
51 | #[cfg(test)] 52 | pub mod strategy; 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | 3 | markers = [ 4 | "oss_skip: marks tests to skip in OSS CI", 5 | ] 6 | asyncio_mode = "auto" 7 | -------------------------------------------------------------------------------- /python/monarch/_monarch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /python/monarch/_monarch/hyperactor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import abc 9 | 10 | from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage 11 | 12 | from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 13 | LocalAllocatorBase, 14 | ) 15 | 16 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox, PortId 17 | 18 | from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension 19 | ActorId, 20 | Alloc, 21 | AllocConstraints, 22 | AllocSpec, 23 | init_proc, 24 | Proc, 25 | Serialized, 26 | ) 27 | 28 | from monarch._rust_bindings.monarch_hyperactor.shape import ( # @manual=//monarch/monarch_extension:monarch_extension 29 | Shape, 30 | ) 31 | 32 | 33 | class Actor(abc.ABC): 34 | @abc.abstractmethod 35 | async def handle(self, mailbox: Mailbox, message: PythonMessage) -> None: ... 36 | 37 | async def handle_cast( 38 | self, 39 | mailbox: Mailbox, 40 | rank: int, 41 | coordinates: list[tuple[str, int]], 42 | message: PythonMessage, 43 | ) -> None: 44 | await self.handle(mailbox, message) 45 | 46 | 47 | __all__ = [ 48 | "init_proc", 49 | "Actor", 50 | "ActorId", 51 | "ActorHandle", 52 | "Alloc", 53 | "AllocSpec", 54 | "PortId", 55 | "Proc", 56 | "Serialized", 57 | "PickledMessage", 58 | "PickledMessageClientActor", 59 | "PythonMessage", 60 | "Mailbox", 61 | "PortHandle", 62 | "PortReceiver", 63 | "OncePortHandle", 64 | "OncePortReceiver", 65 | "Alloc", 66 | "AllocSpec", 67 | "AllocConstraints", 68 | "ProcMesh", 69 | "PythonActorMesh", 70 | "ProcessAllocatorBase", 71 | "Shape", 72 | "Selection", 73 | "LocalAllocatorBase", 74 | ] 75 | -------------------------------------------------------------------------------- /python/monarch/_monarch/selection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from monarch._rust_bindings.monarch_hyperactor.selection import ( # @manual=//monarch/monarch_extension:monarch_extension 8 | Selection, 9 | ) 10 | 11 | __all__ = [ 12 | "Selection", 13 | ] 14 | -------------------------------------------------------------------------------- /python/monarch/_monarch/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_monarch/worker/__init__.py -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_rust_bindings/__init__.pyi -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/hyperactor_extension/alloc.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Dict, final, Optional 8 | 9 | class Alloc: 10 | """ 11 | An alloc represents an allocation of procs. Allocs are returned by 12 | one of the allocator implementations, such as `ProcessAllocator` or 13 | `LocalAllocator`. 14 | """ 15 | 16 | @final 17 | class AllocConstraints: 18 | def __init__(self, match_labels: Optional[Dict[str, str]] = None) -> None: 19 | """ 20 | Create a new alloc constraints. 21 | 22 | Arguments: 23 | - `match_labels`: A dictionary of labels to match. If a label is present 24 | in the dictionary, the alloc must have that label and its value 25 | must match the value in the dictionary. 26 | """ 27 | ... 28 | 29 | @final 30 | class AllocSpec: 31 | def __init__(self, constraints: AllocConstraints, **kwargs: int) -> None: 32 | """ 33 | Initialize a shape with the provided dimension-size pairs. 34 | For example, `AllocSpec(constraints, replica=2, host=3, gpu=8)` creates a 35 | shape with 2 replicas with 3 hosts each, each of which in turn 36 | has 8 GPUs. 37 | """ 38 | ... 39 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/hyperactor_extension/telemetry.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def forward_to_tracing(message: str, file: str, lineno: int, level: int) -> None: 8 | """ 9 | Log a message with the given metadata. 10 | 11 | Args: 12 | - message (str): The log message. 13 | - file (str): The file where the log message originated. 14 | - lineno (int): The line number where the log message originated. 15 | - level (int): The log level (10 for debug, 20 for info, 30 for warn, 40 for error). 16 | """ 17 | ... 
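# A minimal illustrative sketch (not part of the stub above): one way to route
# standard-library logging records into this hook. Only the `forward_to_tracing`
# signature is taken from this file; the `TracingHandler` class and the logger
# name are hypothetical.
#
#     import logging
#     from monarch._rust_bindings.hyperactor_extension.telemetry import forward_to_tracing
#
#     class TracingHandler(logging.Handler):
#         def emit(self, record: logging.LogRecord) -> None:
#             # Map stdlib record fields onto (message, file, lineno, level).
#             forward_to_tracing(
#                 record.getMessage(), record.pathname, record.lineno, record.levelno
#             )
#
#     logging.getLogger("monarch").addHandler(TracingHandler())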
18 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_rust_bindings/monarch_extension/__init__.pyi -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/debugger.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Optional, Union 8 | 9 | from monarch._rust_bindings.monarch_hyperactor.proc import Serialized 10 | from monarch._rust_bindings.monarch_messages.debugger import ( 11 | DebuggerAction, 12 | DebuggerActionType, 13 | ) 14 | 15 | @final 16 | class DebuggerMessage: 17 | """A message for debugger communication between worker and client.""" 18 | 19 | def __init__(self, action: DebuggerActionType) -> None: 20 | """ 21 | Create a new DebuggerMessage. 22 | 23 | Arguments: 24 | action: The debugger action to include in the message. 25 | """ 26 | ... 27 | 28 | @property 29 | def action(self) -> DebuggerActionType: 30 | """Get the debugger action contained in this message.""" 31 | ... 32 | 33 | def serialize(self) -> Serialized: 34 | """ 35 | Serialize this message for transmission. 36 | 37 | Returns: 38 | A serialized representation of this message. 39 | """ 40 | ... 41 | 42 | @final 43 | class PdbActor: 44 | """An actor for interacting with PDB debugging sessions.""" 45 | 46 | def __init__(self) -> None: 47 | """Create a new PdbActor.""" 48 | ... 49 | 50 | def send(self, action: DebuggerActionType) -> None: 51 | """ 52 | Send a debugger action to the worker. 53 | 54 | Arguments: 55 | action: The debugger action to send. 56 | """ 57 | ... 58 | 59 | def receive(self) -> Optional[DebuggerActionType]: 60 | """ 61 | Receive a debugger action from the worker. 62 | 63 | Returns: 64 | A DebuggerAction if one is available, or None if no action is available. 65 | """ 66 | ... 67 | 68 | def drain_and_stop(self) -> None: 69 | """ 70 | Drain any remaining messages and stop the actor. 71 | """ 72 | ... 73 | 74 | def get_bytes_from_write_action(action: DebuggerAction.Write) -> bytes: 75 | """ 76 | Extract the bytes from the provided write action. 77 | """ 78 | ... 79 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/panic.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def panicking_function() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/simulator_client.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | @final 10 | class SimulatorClient: 11 | """ 12 | A wrapper around [simulator_client::Simulatorclient] to expose it to python. 13 | It is a client to communicate with the simulator service. 14 | 15 | Arguments: 16 | - `proxy_addr`: Address of the simulator's proxy server. 17 | """ 18 | 19 | def __init__(self, proxy_addr: str) -> None: ... 20 | def kill_world(self, world_name: str) -> None: 21 | """ 22 | Kill the world with the given name. 23 | 24 | Arguments: 25 | - `world_name`: Name of the world to kill. 26 | """ 27 | ... 28 | def spawn_mesh( 29 | self, system_addr: str, controller_actor_id: str, worker_world: str 30 | ) -> None: 31 | """ 32 | Spawn a mesh actor. 33 | 34 | Arguments: 35 | - `system_addr`: Address of the system to spawn the mesh in. 36 | - `controller_actor_id`: Actor id of the controller to spawn the mesh in. 37 | - `worker_world`: World of the worker to spawn the mesh in. 38 | """ 39 | ... 40 | 41 | def set_training_script_state_running(self) -> None: 42 | """ 43 | Let the simulator know that the training script is actively sending 44 | commands to the backend 45 | """ 46 | ... 47 | 48 | def set_training_script_state_waiting(self) -> None: 49 | """ 50 | Let the simulator know that the training script is waiting for the 51 | backend to resolve a future 52 | """ 53 | ... 54 | 55 | def bootstrap_simulator_backend( 56 | system_addr: str, proxy_addr: str, world_size: int 57 | ) -> None: 58 | """ 59 | Bootstrap the simulator backend on the current process 60 | """ 61 | ... 62 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/actor_mesh.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import final 10 | 11 | from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage 12 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox 13 | 14 | from monarch._rust_bindings.monarch_hyperactor.proc import ActorId 15 | 16 | from monarch._rust_bindings.monarch_hyperactor.shape import Shape 17 | 18 | @final 19 | class PythonActorMesh: 20 | def cast(self, message: PythonMessage) -> None: 21 | """ 22 | Cast a message to this mesh. 23 | """ 24 | 25 | def get(self, rank: int) -> ActorId | None: 26 | """ 27 | Get the actor id for the actor at the given rank. 28 | """ 29 | ... 30 | 31 | @property 32 | def client(self) -> Mailbox: 33 | """ 34 | A client that can be used to communicate with individual 35 | actors in the mesh, and also to create ports that can be 36 | broadcast across the mesh) 37 | """ 38 | ... 39 | 40 | @property 41 | def shape(self) -> Shape: 42 | """ 43 | The Shape object that describes how the rank of an actor 44 | retrieved with get corresponds to coordinates in the 45 | mesh. 46 | """ 47 | ... 48 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/alloc.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import Optional 10 | 11 | from monarch._rust_bindings.hyperactor_extension.alloc import Alloc, AllocSpec 12 | 13 | class ProcessAllocatorBase: 14 | def __init__( 15 | self, 16 | program: str, 17 | args: Optional[list[str]] = None, 18 | envs: Optional[dict[str, str]] = None, 19 | ) -> None: 20 | """ 21 | Create a new process allocator. 22 | 23 | Arguments: 24 | - `program`: The program for each process to run. Must be a hyperactor 25 | bootstrapped program. 26 | - `args`: The arguments to pass to the program. 27 | - `envs`: The environment variables to set for the program. 28 | """ 29 | ... 30 | 31 | async def allocate_nonblocking(self, spec: AllocSpec) -> Alloc: 32 | """ 33 | Allocate a process according to the provided spec. 34 | 35 | Arguments: 36 | - `spec`: The spec to allocate according to. 37 | """ 38 | ... 39 | 40 | def allocate_blocking(self, spec: AllocSpec) -> Alloc: 41 | """ 42 | Allocate a process according to the provided spec, blocking until an 43 | alloc is returned. 44 | 45 | Arguments: 46 | - `spec`: The spec to allocate according to. 47 | """ 48 | ... 49 | 50 | class LocalAllocatorBase: 51 | async def allocate_nonblocking(self, spec: AllocSpec) -> Alloc: 52 | """ 53 | Allocate a process according to the provided spec. 54 | 55 | Arguments: 56 | - `spec`: The spec to allocate according to. 57 | """ 58 | ... 59 | 60 | def allocate_blocking(self, spec: AllocSpec) -> Alloc: 61 | """ 62 | Allocate a process according to the provided spec, blocking until an 63 | alloc is returned. 64 | 65 | Arguments: 66 | - `spec`: The spec to allocate according to. 67 | """ 68 | ... 69 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/bootstrap.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def bootstrap_main() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/proc_mesh.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import final, Type 10 | 11 | from monarch._rust_bindings.hyperactor_extension.alloc import Alloc 12 | from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh 13 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox 14 | from monarch._rust_bindings.monarch_hyperactor.proc import Actor 15 | from monarch._rust_bindings.monarch_hyperactor.shape import Shape 16 | 17 | @final 18 | class ProcMesh: 19 | @classmethod 20 | async def allocate_nonblocking(self, alloc: Alloc) -> ProcMesh: 21 | """ 22 | Allocate a process mesh according to the provided alloc. 23 | Returns when the mesh is fully allocated. 24 | 25 | Arguments: 26 | - `alloc`: The alloc to allocate according to. 27 | """ 28 | ... 
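    # Illustrative sketch (not part of this stub): a typical non-blocking flow,
    # assuming an allocator exposing `allocate_nonblocking` (see alloc.pyi earlier
    # in this listing) and a user-defined actor class. `allocator`, `spec`, and
    # `MyActor` are hypothetical names.
    #
    #     alloc = await allocator.allocate_nonblocking(spec)
    #     mesh = await ProcMesh.allocate_nonblocking(alloc)
    #     actors = await mesh.spawn_nonblocking("my_actor", MyActor)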
29 | 30 | @classmethod 31 | def allocate_blocking(self, alloc: Alloc) -> ProcMesh: 32 | """ 33 | Allocate a process mesh according to the provided alloc. 34 | Blocks until the mesh is fully allocated. 35 | 36 | Arguments: 37 | - `alloc`: The alloc to allocate according to. 38 | """ 39 | ... 40 | 41 | async def spawn_nonblocking(self, name: str, actor: Type[Actor]) -> PythonActorMesh: 42 | """ 43 | Spawn a new actor on this mesh. 44 | 45 | Arguments: 46 | - `name`: Name of the actor. 47 | - `actor`: The type of the actor that will be spawned. 48 | """ 49 | ... 50 | 51 | async def spawn_blocking(self, name: str, actor: Type[Actor]) -> PythonActorMesh: 52 | """ 53 | Spawn a new actor on this mesh. Blocks until the actor is fully spawned. 54 | 55 | Arguments: 56 | - `name`: Name of the actor. 57 | - `actor`: The type of the actor that will be spawned. 58 | """ 59 | ... 60 | 61 | @property 62 | def client(self) -> Mailbox: 63 | """ 64 | A client that can be used to communicate with individual 65 | actors in the mesh, and also to create ports that can be 66 | broadcast across the mesh) 67 | """ 68 | ... 69 | 70 | @property 71 | def shape(self) -> Shape: 72 | """ 73 | The shape of the mesh. 74 | """ 75 | ... 76 | 77 | def __repr__(self) -> str: ... 78 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/runtime.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | """ 10 | Type hints for the runtime module. 11 | """ 12 | 13 | def sleep_indefinitely_for_unit_tests() -> None: 14 | """ 15 | A test function that sleeps indefinitely in a loop. 16 | This is used for testing signal handling in signal_safe_block_on. 17 | The function will sleep forever until interrupted by a signal. 18 | 19 | Raises: 20 | KeyboardInterrupt: When interrupted by a signal like SIGINT 21 | """ 22 | ... 23 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/selection.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | @final 10 | class Selection: 11 | """Opaque representation of a selection expression used to represent 12 | constraints over multidimensional shapes. 13 | 14 | Construct via from_string()` and use with mesh APIs to filter, 15 | evaluate, or route over structured topologies. 16 | """ 17 | def __repr__(self) -> str: ... 18 | @classmethod 19 | def from_string(cls, s: str) -> Selection: 20 | """Parse a selection expression from a string. 21 | 22 | Accepts a compact string syntax such as `"(*, 0:4)"` or `"0 & (1 | 2)"`, 23 | and returns a structured Selection object. 24 | 25 | Raises: 26 | ValueError: if the input string is not a valid selection expression. 27 | """ 28 | ... 
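# Illustrative sketch (not part of the stub): round-tripping the compact syntax
# quoted in the from_string docstring. How the resulting object is consumed is
# up to the mesh APIs that accept a Selection.
#
#     from monarch._rust_bindings.monarch_hyperactor.selection import Selection
#
#     sel = Selection.from_string("(*, 0:4)")
#     print(repr(sel))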
29 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_messages/debugger.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Union 8 | 9 | @final 10 | class DebuggerAction: 11 | """Enum representing actions for the debugger communication between worker and client.""" 12 | 13 | class Paused: 14 | """ 15 | Sent from worker to client to indicate that the worker has entered 16 | a pdb debugging session. 17 | """ 18 | 19 | pass 20 | 21 | class Attach: 22 | """ 23 | Sent from client to worker to indicate that the client has started 24 | the debugging session. 25 | """ 26 | 27 | pass 28 | 29 | class Detach: 30 | """Sent to client or to worker to end the debugging session.""" 31 | 32 | pass 33 | 34 | class Write: 35 | """Sent to client or to worker to write bytes to receiver's stdout.""" 36 | 37 | def __init__(self, bytes: bytes) -> None: ... 38 | 39 | class Read: 40 | """Sent from worker to client to read bytes from client's stdin.""" 41 | 42 | def __init__(self, requested_size: int) -> None: ... 43 | @property 44 | def requested_size(self) -> int: 45 | """Get the number of bytes to read from stdin.""" 46 | ... 47 | 48 | DebuggerActionType = Union[ 49 | DebuggerAction.Paused, 50 | DebuggerAction.Attach, 51 | DebuggerAction.Detach, 52 | DebuggerAction.Read, 53 | DebuggerAction.Write, 54 | ] 55 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_tensor_worker/bootstrap.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Optional, Tuple 8 | 9 | class WorkerServerRequest: 10 | """ 11 | Python binding for the Rust WorkerServerRequest enum. 12 | """ 13 | 14 | @final 15 | class Run(WorkerServerRequest): 16 | """ 17 | Create a Run request variant. 18 | 19 | Args: 20 | world_id: The ID of the world 21 | proc_id: The ID of the process 22 | bootstrap_addr: The bootstrap address 23 | 24 | Returns: 25 | A WorkerServerRequest.Run instance 26 | """ 27 | def __init__( 28 | self, 29 | *, 30 | world_id: str, 31 | proc_id: str, 32 | bootstrap_addr: str, 33 | labels: list[Tuple[str, str]], 34 | ) -> None: ... 35 | 36 | @final 37 | class Exit(WorkerServerRequest): 38 | """ 39 | Create an Exit request variant. 40 | 41 | Returns: 42 | A WorkerServerRequest.Exit instance 43 | """ 44 | 45 | pass 46 | 47 | def to_json(self) -> str: 48 | """ 49 | Convert this request to a JSON string. 50 | 51 | Returns: 52 | A JSON string representation of this request 53 | 54 | Raises: 55 | Exception: If serialization fails 56 | """ 57 | pass 58 | 59 | class WorkerServerResponse: 60 | """ 61 | Python binding for the Rust WorkerServerResponse enum. 62 | """ 63 | 64 | @final 65 | class Finished(WorkerServerResponse): 66 | """ 67 | Create a Finished response variant. 
68 | 69 | Args: 70 | error: An optional error message if the operation failed 71 | 72 | Returns: 73 | A WorkerServerResponse.Finished instance 74 | """ 75 | 76 | error: Optional[str] 77 | 78 | @classmethod 79 | def from_json(cls, json: str) -> "WorkerServerResponse": 80 | """ 81 | Create a WorkerServerResponse from a JSON string. 82 | 83 | Args: 84 | json: A JSON string representation of a WorkerServerResponse 85 | 86 | Returns: 87 | The deserialized WorkerServerResponse 88 | 89 | Raises: 90 | Exception: If deserialization fails 91 | """ 92 | pass 93 | -------------------------------------------------------------------------------- /python/monarch/allocator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | from monarch import ActorFuture as Future 10 | from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 11 | Alloc, 12 | AllocSpec, 13 | ) 14 | 15 | from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 16 | LocalAllocatorBase, 17 | ProcessAllocatorBase, 18 | ) 19 | 20 | 21 | @final 22 | class ProcessAllocator(ProcessAllocatorBase): 23 | """ 24 | An allocator that allocates by spawning local processes. 25 | """ 26 | 27 | def allocate(self, spec: AllocSpec) -> Future[Alloc]: 28 | """ 29 | Allocate a process according to the provided spec. 30 | 31 | Arguments: 32 | - `spec`: The spec to allocate according to. 33 | 34 | Returns: 35 | - A future that will be fulfilled when the requested allocation is fulfilled. 36 | """ 37 | return Future( 38 | lambda: self.allocate_nonblocking(spec), 39 | lambda: self.allocate_blocking(spec), 40 | ) 41 | 42 | 43 | @final 44 | class LocalAllocator(LocalAllocatorBase): 45 | """ 46 | An allocator that allocates by spawning actors into the current process. 47 | """ 48 | 49 | def allocate(self, spec: AllocSpec) -> Future[Alloc]: 50 | """ 51 | Allocate a process according to the provided spec. 52 | 53 | Arguments: 54 | - `spec`: The spec to allocate according to. 55 | 56 | Returns: 57 | - A future that will be fulfilled when the requested allocation is fulfilled. 58 | """ 59 | return Future( 60 | lambda: self.allocate_nonblocking(spec), 61 | lambda: self.allocate_blocking(spec), 62 | ) 63 | -------------------------------------------------------------------------------- /python/monarch/builtins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | """ 9 | Builtins for Monarch is a set of remote function defintions for PyTorch functions and other utilities. 10 | """ 11 | 12 | from .log import log_remote, set_logging_level_remote 13 | 14 | __all__ = ["log_remote", "set_logging_level_remote"] 15 | -------------------------------------------------------------------------------- /python/monarch/builtins/log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from monarch.common.remote import remote 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @remote(propagate="inspect") 16 | def log_remote(*args, level: int = logging.WARNING, **kwargs) -> None: 17 | logger.log(level, *args, **kwargs) 18 | 19 | 20 | @remote(propagate="inspect") 21 | def set_logging_level_remote(level: int) -> None: 22 | logger.setLevel(level) 23 | -------------------------------------------------------------------------------- /python/monarch/builtins/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre strict 8 | from typing import Callable 9 | 10 | import torch 11 | from monarch.common.remote import remote 12 | 13 | 14 | @remote(propagate="inspect") 15 | def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None: 16 | torch.manual_seed(seed ^ process_idx) 17 | 18 | 19 | @remote(propagate=lambda: 0) 20 | def initial_seed_remote() -> int: 21 | return torch.initial_seed() 22 | 23 | 24 | @remote(propagate=lambda: torch.zeros(1)) 25 | def get_rng_state_remote() -> torch.Tensor: 26 | return torch.get_rng_state() 27 | 28 | 29 | @remote(propagate="inspect") 30 | def set_rng_state_remote(new_state: torch.Tensor) -> None: 31 | torch.set_rng_state(new_state) 32 | 33 | 34 | def _run_no_return(f: Callable) -> None: 35 | f() 36 | return None 37 | 38 | 39 | # TODO: return result when uint64 is supported from remote function 40 | @remote(propagate=lambda: _run_no_return(torch.seed)) 41 | def seed_remote() -> None: 42 | torch.seed() 43 | 44 | 45 | # same underlying implementation as seed_remote (torch.seed) 46 | # TODO: return result when uint64 is supported from remote function 47 | @remote(propagate=lambda: _run_no_return(torch.random.seed)) 48 | def random_seed_remote() -> None: 49 | torch.random.seed() 50 | 51 | 52 | @remote(propagate="inspect") 53 | def manual_seed_cuda_remote(seed: int) -> None: 54 | torch.cuda.manual_seed(seed) 55 | 56 | 57 | @remote(propagate="inspect") 58 | def manual_seed_all_cuda_remote(seed: int) -> None: 59 | torch.cuda.manual_seed_all(seed) 60 | 61 | 62 | @remote(propagate=lambda: [torch.zeros(1)]) 63 | def get_rng_state_all_cuda_remote() -> list[torch.Tensor]: 64 | return torch.cuda.get_rng_state_all() 65 | 66 | 67 | @remote(propagate="inspect") 68 | def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None: 69 | torch.cuda.set_rng_state_all(states) 70 | -------------------------------------------------------------------------------- /python/monarch/common/_C.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | def patch_cuda() -> None: ... 10 | def mock_cuda() -> None: ... 11 | def unmock_cuda() -> None: ... 
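# Illustrative note (not part of the stub): these bindings are wrapped by
# `monarch.common.mock_cuda` later in this listing, which calls `patch_cuda()`
# once at import time and toggles mocking around a dedicated CUDA stream. A
# hedged usage sketch (`run_under_mocked_cuda` is a hypothetical user function):
#
#     from monarch.common.mock_cuda import mock_cuda_guard
#
#     with mock_cuda_guard():  # mock_cuda() on entry, unmock_cuda() on exit
#         run_under_mocked_cuda()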
12 | -------------------------------------------------------------------------------- /python/monarch/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/common/__init__.py -------------------------------------------------------------------------------- /python/monarch/common/_device_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import re 9 | from pathlib import Path 10 | 11 | 12 | def _local_device_count(): 13 | if "CUDA_VISIBLE_DEVICES" in os.environ: 14 | return len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) 15 | dev_path = Path("/dev") 16 | pattern = re.compile(r"nvidia\d+$") 17 | nvidia_devices = [dev for dev in dev_path.iterdir() if pattern.match(dev.name)] 18 | return len(nvidia_devices) 19 | -------------------------------------------------------------------------------- /python/monarch/common/base_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import torch 9 | 10 | 11 | # All of the tensor examples in this zoo inherit from BaseTensor. Ideally, 12 | # however, they would inherit directly from Tensor. This is just our staging 13 | # ground for applying behavior that hasn't yet made it into core but that 14 | # we would like to apply by default. 15 | class BaseTensor(torch.Tensor): 16 | # See https://github.com/pytorch/pytorch/pull/73727 ; this is necessary 17 | # to ensure that super().__new__ can cooperate with each other 18 | @staticmethod 19 | def __new__(cls, elem, *, requires_grad=None): 20 | if requires_grad is None: 21 | return super().__new__(cls, elem) 22 | else: 23 | return cls._make_subclass(cls, elem, requires_grad) 24 | 25 | # If __torch_dispatch__ is defined (which it will be for all our examples) 26 | # the default torch function implementation (which preserves subclasses) 27 | # typically must be disabled 28 | __torch_function__ = torch._C._disabled_torch_function_impl 29 | -------------------------------------------------------------------------------- /python/monarch/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | SIM_MESH_CLIENT_TIMEOUT = 5 10 | SIM_MESH_CLIENT_SUPERVISION_UPDATE_INTERVAL = 5 11 | -------------------------------------------------------------------------------- /python/monarch/common/context_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from functools import wraps 9 | 10 | 11 | class _ContextManager: 12 | def __init__(self, generator): 13 | self.generator = generator 14 | self.generator.send(None) 15 | 16 | def __enter__(self): 17 | return 18 | 19 | def __exit__(self, *args): 20 | try: 21 | self.generator.send(None) 22 | except StopIteration: 23 | pass 24 | else: 25 | raise RuntimeError("context manager generator did not exit") 26 | 27 | 28 | def activate_first_context_manager(func): 29 | """ 30 | Similar to contextlib.contextmanager but it 31 | starts the context when the function is called rather than 32 | than at the start of the with statement. Useful for things where 33 | you want to optionally activate the context without a guard. 34 | """ 35 | 36 | @wraps(func) 37 | def helper(*args, **kwargs): 38 | return _ContextManager(func(*args, **kwargs)) 39 | 40 | return helper 41 | -------------------------------------------------------------------------------- /python/monarch/common/fake.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from concurrent.futures import ThreadPoolExecutor 9 | from functools import cache 10 | 11 | from torch._subclasses.fake_tensor import FakeTensorMode 12 | 13 | 14 | @cache 15 | def _fake_mode_worker(): 16 | return ThreadPoolExecutor(max_workers=1) 17 | 18 | 19 | @cache 20 | def _fake_mode(): 21 | return FakeTensorMode() 22 | 23 | 24 | def fake_call(fn, *args, **kwargs): 25 | """Execute on work on a ThreadPool worker 26 | 27 | First call (ThreadPoolExecutor init) will take the GIL and may block for long time! 28 | TODO: this will be replaced with something more performant 29 | """ 30 | global _fake_mode_worker, fake_mode 31 | 32 | # # Calls FakeTensorMode while re-enabling version counter tracking 33 | # # todo(chilli): I'm not totally sure why I need to disable python dispatch 34 | # # key. Perhaps there's some unwrapping that should have happened further up. 35 | # include_to_set = torch._C._dispatch_tls_local_include_set() 36 | # exclude_to_set = ( 37 | # torch._C._dispatch_tls_local_exclude_set() 38 | # | torch._C.DispatchKeySet(torch._C.DispatchKey.Python) 39 | # ) - torch._C.DispatchKeySet(torch._C.DispatchKey.ADInplaceOrView) 40 | 41 | # def work(): 42 | # with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set): 43 | # with fake_mode: 44 | # return fn(*args, **kwargs) 45 | 46 | # return work() 47 | 48 | def work(): 49 | # fake mode must be initialized in the worker thread 50 | # otherwise a monarch dispatch mode may be active, causing 51 | # FakeTensorMode to initialize wrong. 52 | with _fake_mode(): 53 | return fn(*args, **kwargs) 54 | 55 | return _fake_mode_worker().submit(work).result() 56 | -------------------------------------------------------------------------------- /python/monarch/common/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <Python.h> 10 | // @lint-ignore CLANGTIDY facebook-hte-RelativeInclude 11 | #include "mock_cuda.h" 12 | 13 | static PyMethodDef _C_methods[] = { 14 | {"patch_cuda", 15 | patch_cuda, 16 | METH_NOARGS, 17 | "Initialize the monarch cuda patch."}, 18 | {"mock_cuda", mock_cuda, METH_NOARGS, "Enable cuda mocking."}, 19 | {"unmock_cuda", unmock_cuda, METH_NOARGS, "Disable cuda mocking."}, 20 | {NULL, NULL, 0, NULL}}; 21 | 22 | static struct PyModuleDef _C_module = { 23 | PyModuleDef_HEAD_INIT, 24 | "_C", 25 | "A module containing monarch C++ functionality.", 26 | -1, 27 | _C_methods, 28 | NULL, 29 | NULL, 30 | NULL, 31 | NULL}; 32 | 33 | PyMODINIT_FUNC PyInit__C(void) { 34 | return PyModule_Create(&_C_module); 35 | } 36 | -------------------------------------------------------------------------------- /python/monarch/common/mock_cuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include <Python.h> 12 | 13 | PyObject* patch_cuda(PyObject*, PyObject*); 14 | PyObject* mock_cuda(PyObject*, PyObject*); 15 | PyObject* unmock_cuda(PyObject*, PyObject*); 16 | -------------------------------------------------------------------------------- /python/monarch/common/mock_cuda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | from contextlib import contextmanager 9 | from typing import Generator, Optional 10 | 11 | import monarch.common._C # @manual=//monarch/python/monarch/common:_C 12 | import torch 13 | 14 | monarch.common._C.patch_cuda() 15 | 16 | _mock_cuda_stream: Optional[torch.cuda.Stream] = None 17 | 18 | 19 | def get_mock_cuda_stream() -> torch.cuda.Stream: 20 | global _mock_cuda_stream 21 | if _mock_cuda_stream is None: 22 | _mock_cuda_stream = torch.cuda.Stream() 23 | return _mock_cuda_stream 24 | 25 | 26 | @contextmanager 27 | def mock_cuda_guard() -> Generator[None, None, None]: 28 | try: 29 | with torch.cuda.stream(get_mock_cuda_stream()): 30 | monarch.common._C.mock_cuda() 31 | yield 32 | finally: 33 | monarch.common._C.unmock_cuda() 34 | 35 | 36 | def mock_cuda() -> None: 37 | monarch.common._C.mock_cuda() 38 | 39 | 40 | def unmock_cuda() -> None: 41 | monarch.common._C.unmock_cuda() 42 | -------------------------------------------------------------------------------- /python/monarch/common/pickle_flatten.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | import io 8 | import pickle 9 | from typing import Any, Callable, Iterable, List, Tuple 10 | 11 | import cloudpickle 12 | 13 | 14 | class _Pickler(cloudpickle.Pickler): 15 | def __init__(self, filter): 16 | self.f = io.BytesIO() 17 | super().__init__(self.f) 18 | self._filter = filter 19 | self._saved = [] 20 | 21 | def persistent_id(self, obj): 22 | if not self._filter(obj): 23 | return None 24 | self._saved.append(obj) 25 | return len(self._saved) - 1 26 | 27 | 28 | class _Unpickler(pickle.Unpickler): 29 | def __init__(self, data, sequence: Iterable[Any]): 30 | super().__init__(io.BytesIO(data)) 31 | self._iter = iter(sequence) 32 | self._values = [] 33 | 34 | def persistent_load(self, id): 35 | while id >= len(self._values): 36 | self._values.append(next(self._iter)) 37 | return self._values[id] 38 | 39 | 40 | def flatten(obj: Any, filter: Callable[[Any], bool]) -> Tuple[List[Any], bytes]: 41 | pickler = _Pickler(filter) 42 | pickler.dump(obj) 43 | return pickler._saved, pickler.f.getvalue() 44 | 45 | 46 | def unflatten(data: bytes, values: Iterable[Any]) -> Any: 47 | up = _Unpickler(data, values) 48 | return up.load() 49 | -------------------------------------------------------------------------------- /python/monarch/common/process_group.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | 9 | import logging 10 | 11 | import torch.distributed as dist 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _wrap_method(process_group: dist.ProcessGroup, method): 17 | def wrapper(*args, **kwargs): 18 | logger.debug( 19 | "ProcessGroup Call: %s with args %s and kwargs %s", method, args, kwargs 20 | ) 21 | fn = getattr(process_group, method) 22 | try: 23 | return fn(*args, **kwargs) 24 | except Exception as e: 25 | logger.warning( 26 | "ProcessGroup Call: %s with args %s and kwargs %s failed with exception: %s", 27 | method, 28 | args, 29 | kwargs, 30 | str(e), 31 | ) 32 | # TODO(rajeshn): send a message back to the controller that this 33 | # worker had a failed communication event 34 | raise e 35 | 36 | return wrapper 37 | 38 | 39 | class SingleControllerProcessGroupWrapper: 40 | """ 41 | Wraps a ProcessGroup object to provide a single controller process group. This provides us a hook to observe 42 | all the operatons on the process group to the controller. 43 | """ 44 | 45 | def __new__(cls, pg: dist.ProcessGroup): 46 | instance = super().__new__(cls) 47 | 48 | for attr in dir(type(pg)): 49 | if not attr.startswith("__") and callable(getattr(type(pg), attr)): 50 | setattr(instance, attr, _wrap_method(pg, attr)) 51 | 52 | return instance 53 | 54 | def __init__(self, process_group): 55 | self.process_group = process_group 56 | -------------------------------------------------------------------------------- /python/monarch/common/reference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | from typing import Optional 9 | 10 | from monarch._rust_bindings.monarch_extension.tensor_worker import Ref 11 | 12 | 13 | class Referenceable: 14 | def __init__(self): 15 | self.ref: Optional[int] = None 16 | 17 | def delete_ref(self, ref): 18 | raise NotImplementedError("no delete_ref method") 19 | 20 | def __reduce_ex__(self, protocol): 21 | assert ( 22 | self.ref is not None 23 | ), f"{self} is being sent but does not have a reference" 24 | return Ref, (self.ref,) 25 | 26 | # Used by rust backend to get the ref for this object 27 | def __monarch_ref__(self) -> int: 28 | assert self.ref is not None 29 | return self.ref 30 | 31 | def __del__(self): 32 | if self.ref is not None: 33 | self.delete_ref(self.ref) 34 | -------------------------------------------------------------------------------- /python/monarch/common/selection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from monarch._rust_bindings.monarch_hyperactor.selection import Selection 8 | 9 | __all__ = ["Selection"] 10 | -------------------------------------------------------------------------------- /python/monarch/common/tensor_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import NamedTuple, Tuple 9 | 10 | import torch 11 | 12 | 13 | class TensorFactory(NamedTuple): 14 | size: Tuple[int, ...] 15 | dtype: torch.dtype 16 | layout: torch.layout 17 | device: torch.device 18 | 19 | @staticmethod 20 | def from_tensor(t): 21 | return TensorFactory(t.size(), t.dtype, t.layout, t.device) 22 | 23 | def empty(self): 24 | return torch.empty( 25 | self.size, dtype=self.dtype, layout=self.layout, device=self.device 26 | ) 27 | 28 | def zeros(self): 29 | return torch.full( 30 | self.size, 0, dtype=self.dtype, layout=self.layout, device=self.device 31 | ) 32 | -------------------------------------------------------------------------------- /python/monarch/common/tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import Any, Callable, Protocol, Sequence, Tuple 9 | 10 | import torch.utils._pytree as _pytree 11 | from torch.utils._pytree import ( 12 | _get_node_type, 13 | register_pytree_node, 14 | SUPPORTED_NODES, 15 | tree_flatten, 16 | tree_map, 17 | tree_unflatten, 18 | ) 19 | 20 | 21 | def flatten(tree, cond): 22 | r, spec = tree_flatten(tree) 23 | 24 | # be careful to not capture values we return in 25 | # 'trues'. We do not need them to reconstruct and do not want to 26 | # extend their lifetime. 
27 | trues = [] 28 | falses = [] 29 | conds = [] 30 | for e in r: 31 | c = cond(e) 32 | (trues if c else falses).append(e) 33 | conds.append(c) 34 | 35 | def unflatten(n): 36 | n_it = iter(n) 37 | falses_it = iter(falses) 38 | return tree_unflatten([next(n_it if c else falses_it) for c in conds], spec) 39 | 40 | return trues, unflatten 41 | 42 | 43 | def flattener(tree, cond=None): 44 | """ 45 | Produce a _traceable_ flattener routine from tree. That is, it produces code that can 46 | flatten another object shaped the same as tree, but whose structure cannot 47 | be introspected because it might be (e.g.) an fx proxy value. 48 | """ 49 | if isinstance(tree, (tuple, list)): 50 | flattens = [flattener(t, cond) for t in tree] 51 | return lambda obj: [ 52 | f for i, flatten in enumerate(flattens) for f in flatten(obj[i]) 53 | ] 54 | elif isinstance(tree, dict): 55 | keys = tuple(tree.keys()) 56 | flattens = [flattener(t, cond) for t in tree.values()] 57 | return lambda obj: [ 58 | f for k, flatten in zip(keys, flattens) for f in flatten(obj[k]) 59 | ] 60 | elif _get_node_type(tree) in SUPPORTED_NODES: 61 | flatten_fn = SUPPORTED_NODES[_get_node_type(tree)].flatten_fn 62 | trees, _ = flatten_fn(tree) 63 | flattens = [flattener(t, cond) for t in trees] 64 | 65 | def the_flattener(obj): 66 | trees, _ = flatten_fn(obj) 67 | return [f for i, flatten in enumerate(flattens) for f in flatten(trees[i])] 68 | 69 | return the_flattener 70 | elif cond is None or cond(tree): 71 | return lambda obj: [obj] 72 | else: 73 | return lambda obj: [] 74 | -------------------------------------------------------------------------------- /python/monarch/controller/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/controller/debugger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import sys 9 | from typing import Optional 10 | 11 | _is_ipython: Optional[bool] = None 12 | 13 | 14 | def is_ipython() -> bool: 15 | global _is_ipython 16 | if _is_ipython is not None: 17 | return _is_ipython 18 | try: 19 | from IPython import get_ipython 20 | 21 | _is_ipython = get_ipython() is not None 22 | except ImportError: 23 | _is_ipython = False 24 | return _is_ipython 25 | 26 | 27 | def write(msg: str) -> None: 28 | sys.stdout.write(msg) 29 | sys.stdout.flush() 30 | 31 | 32 | def read(requested_size: int) -> bytes: 33 | if not is_ipython(): 34 | b = bytearray(requested_size) 35 | bytes_read = sys.stdin.buffer.raw.readinto(b) 36 | return bytes(b[:bytes_read]) 37 | 38 | # ipython doesn't have stdin directly connected 39 | # so we need to use input() instead. 
40 | user_input = input() + "\n" 41 | input_bytes = user_input.encode("utf-8") 42 | num_bytes_to_write = len(input_bytes) 43 | if requested_size < num_bytes_to_write: 44 | raise RuntimeError( 45 | f"Debugger input line too long, max length is {requested_size}" 46 | ) 47 | return input_bytes[:num_bytes_to_write] 48 | -------------------------------------------------------------------------------- /python/monarch/controller/rust_backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/fetch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | """ 9 | This is a utility file for fetching a shard of a tensor from remote. 10 | """ 11 | 12 | from typing import TypeVar 13 | 14 | from monarch.common.device_mesh import no_mesh 15 | 16 | from monarch.common.future import Future 17 | 18 | from monarch.common.remote import _call_on_shard_and_fetch 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | def fetch_shard( 24 | obj: T, shard: dict[str, int] | None = None, **kwargs: int 25 | ) -> Future[T]: 26 | """ 27 | Retrieve the shard at `coordinates` of the current device mesh of each 28 | tensor in obj. All tensors in `obj` will be fetched to the CPU device. 29 | obj - a pytree containing the tensors the fetch 30 | shard - a dictionary from mesh dimension name to coordinate of the shard 31 | If None, this will fetch from coordinate 0 for all dimensions (useful after all_reduce/all_gather) 32 | preprocess - a 33 | **kwargs - additional keyword arguments are added as entries to the shard dictionary 34 | """ 35 | if kwargs: 36 | if shard is None: 37 | shard = {} 38 | shard.update(kwargs) 39 | 40 | return _call_on_shard_and_fetch( 41 | None, lambda *args, **kwargs: None, obj, shard=shard 42 | ) 43 | 44 | 45 | def show(obj: T, shard: dict[str, int] | None = None, **kwargs: int) -> object: 46 | v = inspect(obj, shard=shard, **kwargs) 47 | # pyre-ignore 48 | from torchshow import show # @manual 49 | 50 | with no_mesh.activate(): 51 | return show(v) 52 | 53 | 54 | def inspect(obj: T, shard: dict[str, int] | None = None, **kwargs: int) -> T: 55 | return fetch_shard(obj, shard=shard, **kwargs).result() 56 | -------------------------------------------------------------------------------- /python/monarch/future.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import asyncio 8 | from typing import Generator, Generic, TypeVar 9 | 10 | R = TypeVar("R") 11 | 12 | 13 | # TODO: consolidate with monarch.common.future 14 | class ActorFuture(Generic[R]): 15 | def __init__(self, impl, blocking_impl=None): 16 | self._impl = impl 17 | self._blocking_impl = blocking_impl 18 | 19 | def get(self) -> R: 20 | if self._blocking_impl is not None: 21 | return self._blocking_impl() 22 | return asyncio.run(self._impl()) 23 | 24 | def __await__(self) -> Generator[R, None, R]: 25 | return self._impl().__await__() 26 | -------------------------------------------------------------------------------- /python/monarch/gradient/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from ._gradient_generator import GradientGenerator 10 | 11 | __all__ = ["GradientGenerator"] 12 | -------------------------------------------------------------------------------- /python/monarch/gradient/_gradient_generator.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import Any, Optional 9 | 10 | import torch 11 | 12 | class GradientGenerator: 13 | def __init__( 14 | self, 15 | roots_list: Any, 16 | with_respect_to: Any, 17 | grad_roots: Any, 18 | context_restorer: Any, 19 | ): ... 20 | # pyre-ignore[11]: Annotation `torch.Tensor` is not defined as a type. 21 | def __next__(self) -> Optional[torch.Tensor]: ... 22 | def __iter__(self) -> "GradientGenerator": ... 23 | -------------------------------------------------------------------------------- /python/monarch/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import itertools 9 | import os 10 | from pathlib import Path 11 | 12 | import torch 13 | from monarch.common.remote import remote 14 | 15 | 16 | PATH_KEY = "dir_snapshots" 17 | _counter = itertools.count() 18 | 19 | 20 | @remote(propagate="inspect") 21 | def record_memory_history() -> None: 22 | torch.cuda.memory._record_memory_history() 23 | 24 | 25 | def dump_memory_snapshot(*args, **kwargs) -> None: 26 | """ 27 | This function wraps torch.cuda.memory._dump_snapshot() to dump memory snapshot remotely. 28 | """ 29 | assert isinstance( 30 | kwargs.get(PATH_KEY, None), str 31 | ), f"{PATH_KEY} must be passed and must be a string to represent the path to save the memory snapshots." 32 | id = next(_counter) 33 | _memory_controller_dump(id, *args, **kwargs) 34 | 35 | 36 | @remote(propagate="inspect") 37 | def _memory_controller_dump(ident, *args, **kwargs) -> None: 38 | dir_path = Path(kwargs[PATH_KEY]).absolute() 39 | os.makedirs(dir_path, exist_ok=True) 40 | # This is not a synchronized call, so it is okay to call without device mesh. 
41 | rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 42 | snapshot_path = f"{dir_path}/snapshot_{rank}.pickle" 43 | torch.cuda.memory._dump_snapshot(filename=snapshot_path) 44 | -------------------------------------------------------------------------------- /python/monarch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from monarch.parallel.pipelining.runtime import get_parameter_udf, PipelineParallelism 8 | 9 | __all__ = ["PipelineParallelism", "get_parameter_udf"] 10 | -------------------------------------------------------------------------------- /python/monarch/parallel/pipelining/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from typing import NamedTuple, Tuple 9 | 10 | import torch 11 | from monarch.common.remote import remote 12 | from monarch.common.tensor import Tensor 13 | 14 | 15 | class State(NamedTuple): 16 | cpu: Tensor 17 | cuda: Tensor 18 | 19 | 20 | @remote( 21 | propagate=lambda: ( 22 | torch.empty(5056, dtype=torch.uint8), 23 | torch.empty(16, dtype=torch.uint8), 24 | ) 25 | ) 26 | def _get_state() -> Tuple[torch.Tensor, torch.Tensor]: 27 | return (torch.get_rng_state(), torch.cuda.get_rng_state()) 28 | 29 | 30 | @remote(propagate=lambda state: None) 31 | def set_state(state: Tuple[Tensor, Tensor]): 32 | cpu, device = state 33 | torch.set_rng_state(cpu) 34 | torch.cuda.set_rng_state(device) 35 | 36 | 37 | @remote(propagate=lambda _: None) 38 | def _manual_seed(seed: torch.Tensor): 39 | torch.manual_seed(seed.item()) 40 | 41 | 42 | @remote(propagate=lambda: None) 43 | def make_deterministic(): 44 | torch.use_deterministic_algorithms(True) 45 | torch.backends.cudnn.deterministic = True 46 | torch.backends.cudnn.benchmark = False 47 | # env var for deterministic CuBLAS 48 | # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html 49 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" 50 | 51 | 52 | def get_state() -> State: 53 | return State(*_get_state()) 54 | 55 | 56 | def new_state(seed: Tensor) -> State: 57 | orig = get_state() 58 | _manual_seed(seed) 59 | mine = get_state() 60 | set_state(orig) 61 | return mine 62 | -------------------------------------------------------------------------------- /python/monarch/simulator/README.md: -------------------------------------------------------------------------------- 1 | ## Brief Introduction 2 | The Simulator can act as a backend, similar to ProcessBackend, or as a standalone object to receive messages from a pre-stored file. 
Its primary function is to simulate the execution time and memory usage based on the input messages. 3 | 4 | ### Execution model 5 | The Simulator consists of multiple worker objects, each maintaining several stream objects. The Simulator forwards messages to the workers, which in turn forward them to the streams. A Stream object maintains a task queue and executes the first task when its dependencies are fulfilled. The task will be marked as finished immediately after executing if it is a computation op. If the task is a collective op, it will only be marked as finished after all other tasks participating in the collective op have been executed. A trace event will be created for the task after it is finished. 6 | 7 | ### Memory model 8 | Currently, only GPU memory is recorded. A GPU tensor must be created by some task, so a GPU tensor is created when a task is created. However, its memory will only be allocated after the task is executed. To avoid double-counting the memory usage of tensors that share the same storage, a WorkerStorageTracker is used to track unique storage. The memory usage is increased only when a new storage is created, and decreased only when an existing storage is deleted. The memory usage of a storage is attributed to the stream that creates the storage. 9 | 10 | ## Current Status, Implemented Features 11 | * Concurrent Task Execution: Traces concurrent tasks across different streams and workers, including collective operations. 12 | * Memory Tracking: Traces memory usage without overcounting, particularly for views. 13 | * Controller Message Tracing: Logs messages from the controller for better oversight and debugging. 14 | 15 | ## Pending Features 16 | * Deduplication: Many workers behave the same. The Simulator should group them to make the trace easier to read and the simulation faster. 17 | * Profiling: The current runtime of each op is hardcoded and incorrect. The Simulator should take the profiling result as data to simulate. We would need a feature to support cached propagation of remote functions. 18 | * Remote Function: The Simulator will fail with the new cache propagation remote function or cause a hang. 19 | * Fetch Shard: Not implemented yet. 20 | * Trace of CPU operations: The current design assumes CPU ops have zero overheads, so CPU tensors will just be created without taking time. This is not accurate and can be an issue if users perform optimizer CPU offloading. 21 | -------------------------------------------------------------------------------- /python/monarch/simulator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/simulator/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
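The memory model described in the simulator README above can be illustrated with a toy sketch; the class and method names here are illustrative only and do not match the real `WorkerStorageTracker` interface:

```python
class ToyStorageTracker:
    """Count GPU memory once per unique storage, attributed to the creating stream."""

    def __init__(self):
        self.live = {}            # storage id -> (stream, nbytes)
        self.mem_per_stream = {}  # stream -> bytes currently attributed

    def tensor_created(self, storage_id, stream, nbytes):
        # Views share a storage id, so they do not add to the count.
        if storage_id not in self.live:
            self.live[storage_id] = (stream, nbytes)
            self.mem_per_stream[stream] = self.mem_per_stream.get(stream, 0) + nbytes

    def storage_deleted(self, storage_id):
        # Memory is released only when the storage itself goes away.
        stream, nbytes = self.live.pop(storage_id)
        self.mem_per_stream[stream] -= nbytes


tracker = ToyStorageTracker()
tracker.tensor_created("s0", stream=0, nbytes=4096)
tracker.tensor_created("s0", stream=0, nbytes=4096)  # a view of the same storage
assert tracker.mem_per_stream[0] == 4096
tracker.storage_deleted("s0")
assert tracker.mem_per_stream[0] == 0
```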
6 | 7 | # pyre-unsafe 8 | import contextlib 9 | 10 | META_VAL = [] 11 | 12 | 13 | @contextlib.contextmanager 14 | def set_meta(new_value): 15 | # Sets the metadata for any tasks created under this 16 | global META_VAL 17 | META_VAL.append(new_value) 18 | try: 19 | yield 20 | finally: 21 | META_VAL.pop() 22 | -------------------------------------------------------------------------------- /python/monarch/simulator/interface.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Union 8 | 9 | from monarch.common.client import Client as _Client 10 | from monarch.common.device_mesh import DeviceMesh 11 | from monarch.common.shape import NDSlice 12 | 13 | from monarch.simulator.ir import IRGraph 14 | from monarch.simulator.simulator import ( 15 | SimulatorBackendMode, 16 | SimulatorController as _SimulatorController, 17 | SimulatorInterface, 18 | SimulatorTraceMode, 19 | ) 20 | 21 | 22 | def Simulator( 23 | hosts: int, 24 | gpus: int, 25 | *, 26 | simulate_mode: Union["str", SimulatorBackendMode] = SimulatorBackendMode.SIMULATE, 27 | trace_mode: Union["str", SimulatorTraceMode] = SimulatorTraceMode.STREAM_ONLY, 28 | upload_trace: bool = False, 29 | trace_path: str = "trace.json", 30 | command_history_path: str = "command_history.pkl", 31 | group_workers: bool = False, 32 | build_ir: bool = False, 33 | ) -> "SimulatorInterface": 34 | if isinstance(simulate_mode, str): 35 | simulate_mode = getattr(SimulatorBackendMode, simulate_mode.upper()) 36 | if isinstance(trace_mode, str): 37 | trace_mode = getattr(SimulatorTraceMode, trace_mode.upper()) 38 | 39 | ir = IRGraph() if build_ir else None 40 | ctrl = _SimulatorController( 41 | hosts * gpus, 42 | gpu_per_host=gpus, 43 | simulate_mode=simulate_mode, 44 | trace_mode=trace_mode, 45 | upload_trace=upload_trace, 46 | trace_path=trace_path, 47 | command_history_path=command_history_path, 48 | group_workers=group_workers, 49 | ir=ir, 50 | ) 51 | client = _Client(ctrl, ctrl.world_size, ctrl.gpu_per_host) 52 | dm = DeviceMesh( 53 | client, 54 | NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1]), 55 | ("host", "gpu"), 56 | ) 57 | 58 | dm.exit = lambda: client.shutdown() 59 | return SimulatorInterface(dm, ctrl, ir) 60 | -------------------------------------------------------------------------------- /python/monarch/simulator/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import os 9 | 10 | import numpy as np 11 | 12 | 13 | def file_path_with_iter(file_path: str, iter_count: int) -> str: 14 | dir_path = os.path.dirname(file_path) 15 | file_name, file_postfix = os.path.basename(file_path).split(".") 16 | file_name = f"{file_name}_{iter_count}.{file_postfix}" 17 | return os.path.join(dir_path, file_name) 18 | 19 | 20 | def compress_workers_range(workers) -> str: 21 | regions = [] 22 | sorted_workers = np.sort(workers) 23 | start = sorted_workers[0] 24 | end = sorted_workers[0] 25 | for i in range(1, len(sorted_workers)): 26 | if sorted_workers[i] == end + 1: 27 | end = sorted_workers[i] 28 | else: 29 | regions.append(f"[{start}-{end}]") 30 | start = sorted_workers[i] 31 | end = sorted_workers[i] 32 | regions.append(f"[{start}-{end}]") 33 | return " ".join(regions) 34 | 35 | 36 | def clean_name(name: str) -> str: 37 | if name.startswith("torch.ops.aten."): 38 | name = name[len("torch.ops.") :] # noqa: whitespace before ':' 39 | if name.endswith(".default"): 40 | name = name[: -len(".default")] 41 | return name 42 | -------------------------------------------------------------------------------- /python/monarch/timer/README.md: -------------------------------------------------------------------------------- 1 | # CUDA Timer 2 | 3 | This folder contains a lightweight CUDA timer utility and examples demonstrating its usage in GPU-accelerated programs. The CUDA Timer is designed to measure the execution time of GPU kernels using CUDA events. 4 | 5 | ## Usage 6 | ### CudaTimer API 7 | 8 | The `CudaTimer` singleton provides a comprehensive timing interface for CUDA operations: 9 | 10 | - `start(label)` - Begins timing a labeled operation 11 | - `stop(label)` - Ends timing for the labeled operation 12 | - `time(label)` - Context manager for automatic timing (recommended usage) 13 | - `reset()` - Clears all collected timing data 14 | - `summary()` - Returns statistical analysis of timing measurements 15 | - `get_latest_measurement(label)` - Gets the latest measurement (in ms) for a given section 16 | - `print_summary()` - Displays formatted timing statistics to console 17 | 18 | ### Within SPMD workloads 19 | We provide an example of CudaTimer within SPMD workloads at [example_spmd.py](example_spmd.py). 20 | 21 | ``` 22 | import torch 23 | from monarch.timer import CudaTimer 24 | 25 | def main(): 26 | if not torch.cuda.is_available(): 27 | print("CUDA is not available. Exiting.") 28 | return 29 | 30 | device = torch.device("cuda") 31 | a = torch.randn(1000, 1000, device=device) 32 | b = torch.randn(1000, 1000, device=device) 33 | 34 | with CudaTimer.time("matrix_multiply"): 35 | result = torch.matmul(a, b) 36 | 37 | CudaTimer.print_summary() 38 | 39 | ``` 40 | 41 | ### Within Monarch workloads 42 | We provide an example of CudaTimer within Monarch workloads at [example_monarch.py](example_monarch.py).
43 | 44 | ``` 45 | import torch 46 | from monarch import inspect, remote 47 | from monarch.rust_local_mesh import local_mesh 48 | 49 | cuda_timer_start = remote("monarch.timer.remote_cuda_timer.cuda_timer_start", propagate="inspect") 50 | cuda_timer_stop = remote("monarch.timer.remote_cuda_timer.cuda_timer_stop", propagate="inspect") 51 | 52 | def main(): 53 | mesh = local_mesh(hosts=1, gpus_per_host=1) 54 | 55 | with mesh.activate(): 56 | a = torch.randn(1000, 1000, device="cuda") 57 | b = torch.randn(1000, 1000, device="cuda") 58 | 59 | cuda_timer_start() 60 | result = torch.matmul(a, b) 61 | cuda_timer_stop() 62 | 63 | cuda_average_ms = get_cuda_timer_average_ms() 64 | local_cuda_avg_ms = inspect(cuda_average_ms).item() 65 | 66 | mesh.exit() 67 | print(f"average time w/ CudaTimer: {local_cuda_avg_ms:.4f} (ms)") 68 | ``` 69 | -------------------------------------------------------------------------------- /python/monarch/timer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .execution_timer import ( 8 | execution_timer_start, 9 | execution_timer_stop, 10 | ExecutionTimer, 11 | get_execution_timer_average_ms, 12 | get_latest_timer_measurement, 13 | ) 14 | 15 | __all__ = [ 16 | "ExecutionTimer", 17 | "execution_timer_start", 18 | "execution_timer_stop", 19 | "get_latest_timer_measurement", 20 | "get_execution_timer_average_ms", 21 | ] 22 | -------------------------------------------------------------------------------- /python/monarch/timer/example_monarch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """An example that demonstrates how to use ExecutionTimer with a Monarch program. 
8 | 9 | Run this with 10 | buck run //monarch/python/monarch/timer:example_monarch 11 | 12 | """ 13 | # pyre-unsafe 14 | 15 | import logging 16 | 17 | import torch 18 | 19 | from monarch import inspect, remote 20 | from monarch.rust_local_mesh import local_mesh 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | execution_timer_start = remote( 26 | "monarch.timer.remote_execution_timer.execution_timer_start", propagate="inspect" 27 | ) 28 | 29 | execution_timer_stop = remote( 30 | "monarch.timer.remote_execution_timer.execution_timer_stop", propagate="inspect" 31 | ) 32 | 33 | get_execution_timer_average_ms = remote( 34 | "monarch.timer.remote_execution_timer.get_execution_timer_average_ms", 35 | propagate=lambda: torch.tensor(0.0, dtype=torch.float64), 36 | ) 37 | 38 | get_time_perfcounter = remote( 39 | "monarch.timer.remote_execution_timer.get_time_perfcounter", 40 | propagate=lambda: torch.tensor(0.0, dtype=torch.float64), 41 | ) 42 | 43 | 44 | def main() -> None: 45 | with local_mesh(hosts=1, gpus_per_host=1) as mesh: 46 | with mesh.activate(): 47 | num_iterations = 5 48 | 49 | a = torch.randn(1000, 1000, device="cuda") 50 | b = torch.randn(1000, 1000, device="cuda") 51 | torch.matmul(a, b) 52 | 53 | total_dt = torch.zeros(1, dtype=torch.float64) 54 | 55 | for _ in range(num_iterations): 56 | t0 = get_time_perfcounter() 57 | torch.matmul(a, b) 58 | total_dt += get_time_perfcounter() - t0 59 | 60 | for _ in range(num_iterations): 61 | execution_timer_start() 62 | torch.matmul(a, b) 63 | execution_timer_stop() 64 | 65 | cuda_average_ms = get_execution_timer_average_ms() 66 | local_total_dt = inspect(total_dt) 67 | local_cuda_avg_ms = inspect(cuda_average_ms) 68 | 69 | local_total_dt = local_total_dt.item() 70 | local_cuda_avg_ms = local_cuda_avg_ms.item() 71 | mesh.exit() 72 | avg_perfcounter_ms = local_total_dt / num_iterations * 1000 73 | print(f"average time w/ perfcounter: {avg_perfcounter_ms:.4f} (ms)") 74 | print(f"average time w/ ExecutionTimer: {local_cuda_avg_ms:.4f} (ms)") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /python/monarch/timer/example_spmd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """An example that demonstrates how to use ExecutionTimer in a SPMD style program. 8 | 9 | Run this with: 10 | buck run //monarch/python/monarch/timer:example_spmd 11 | """ 12 | 13 | import time 14 | 15 | # pyre-strict 16 | 17 | import torch 18 | from monarch.timer import ExecutionTimer 19 | 20 | 21 | def main() -> None: 22 | # Check if CUDA is available 23 | if not torch.cuda.is_available(): 24 | print("CUDA is not available. 
Exiting.") 25 | return 26 | 27 | device = torch.device("cuda") 28 | 29 | num_iterations = 5 30 | 31 | a = torch.randn(1000, 1000, device=device) 32 | b = torch.randn(1000, 1000, device=device) 33 | 34 | # Warmup 35 | torch.matmul(a, b) 36 | torch.cuda.synchronize() 37 | 38 | cpu_timings = [] 39 | for _ in range(num_iterations): 40 | t0 = time.perf_counter() 41 | torch.matmul(a, b) 42 | cpu_timings.append(time.perf_counter() - t0) 43 | 44 | for _ in range(num_iterations): 45 | with ExecutionTimer.time("matrix_multiply"): 46 | torch.matmul(a, b) 47 | 48 | mean_cuda_ms = ExecutionTimer.summary()["matrix_multiply"]["mean_ms"] 49 | mean_perfcounter_ms = sum(cpu_timings) / len(cpu_timings) * 1000 50 | print("mean perf counter times: ", mean_perfcounter_ms) 51 | print("mean cuda times: ", mean_cuda_ms) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /python/monarch/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/tools/components/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/tools/components/hyperactor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import getpass 9 | from typing import Optional 10 | 11 | from monarch.tools import mesh_spec 12 | from monarch.tools.mesh_spec import mesh_spec_from_str 13 | from torchx import specs 14 | 15 | _DEFAULT_MESHES = ["mesh_0:1:gpu.small"] 16 | 17 | _USER: str = getpass.getuser() 18 | 19 | __version__ = "latest" # TODO get version from monarch.__version_ 20 | 21 | 22 | def proc_mesh( 23 | name: str = f"monarch-{_USER}", 24 | image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}", # TODO docker needs to be built and pushed to ghcr 25 | meshes: list[str] = _DEFAULT_MESHES, 26 | env: Optional[dict[str, str]] = None, 27 | port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT, 28 | ) -> specs.AppDef: 29 | """ 30 | Args: 31 | name: the name of the monarch server job 32 | image: docker image to run the job on, for slurm, image is the dir the job is run from 33 | meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}" 34 | env: environment variables to be passed to the main command (e.g. 
ENV1=v1,ENV2=v2,ENV3=v3) 35 | port: the port that the remote process allocator runs on (must be reachable from the client) 36 | """ 37 | 38 | appdef = specs.AppDef(name) 39 | 40 | for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]: 41 | mesh_role = specs.Role( 42 | name=mesh.name, 43 | image=image, 44 | entrypoint="process_allocator", # 'cargo install monarch_hyperactor' to get this binary 45 | args=[ 46 | "mesh-worker", 47 | f"--port={port}", 48 | "--program=monarch_bootstrap", # installed with monarch wheel (as console script) 49 | ], 50 | num_replicas=mesh.num_hosts, 51 | resource=specs.resource(h=mesh.host_type), 52 | env=env or {}, 53 | port_map={"mesh": port}, 54 | ) 55 | appdef.roles.append(mesh_role) 56 | 57 | return appdef 58 | -------------------------------------------------------------------------------- /python/monarch/tools/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | from dataclasses import dataclass, field 9 | from typing import Any, Optional 10 | 11 | 12 | NOT_SET: str = "__NOT_SET__" 13 | 14 | 15 | @dataclass 16 | class Config: 17 | scheduler: str = NOT_SET 18 | scheduler_args: dict[str, Any] = field(default_factory=dict) 19 | workspace: Optional[str] = None 20 | dryrun: bool = False 21 | -------------------------------------------------------------------------------- /python/monarch/tools/config/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
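A hedged example of building a TorchX `AppDef` with the `proc_mesh` component defined in `python/monarch/tools/components/hyperactor.py` above; the mesh name, host count, and environment values are hypothetical:

```python
from monarch.tools.components.hyperactor import proc_mesh

# "trainers:4:gpu.small" follows the "{name}:{num_hosts}:{host_type}" format
# described in the proc_mesh docstring.
appdef = proc_mesh(
    name="monarch-demo",
    meshes=["trainers:4:gpu.small"],
    env={"RUST_LOG": "info"},
)

# One role is created per mesh spec, each running the remote process allocator.
print([role.name for role in appdef.roles])  # ["trainers"]
```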
6 | 7 | # pyre-strict 8 | 9 | """Defines defaults for ``monarch.tools``""" 10 | 11 | from typing import Callable, Optional 12 | 13 | from monarch.tools.components import hyperactor 14 | from monarch.tools.config import Config 15 | 16 | from torchx import specs 17 | from torchx.schedulers import ( 18 | docker_scheduler, 19 | kubernetes_scheduler, 20 | local_scheduler, 21 | SchedulerFactory, 22 | slurm_scheduler, 23 | ) 24 | 25 | 26 | def component_fn(scheduler: str) -> Callable[..., specs.AppDef]: 27 | """The default TorchX component function for the scheduler""" 28 | return hyperactor.proc_mesh 29 | 30 | 31 | def scheduler_factories() -> dict[str, SchedulerFactory]: 32 | """Supported schedulers (name -> scheduler static factory method)""" 33 | return { # pyre-ignore[7] 34 | # --- local schedulers (no multi-host support) --- 35 | "local_cwd": local_scheduler.create_scheduler, 36 | "local_docker": docker_scheduler.create_scheduler, 37 | # --- remote schedulers (yes multi-host support) --- 38 | "slurm": slurm_scheduler.create_scheduler, 39 | "k8s": kubernetes_scheduler.create_scheduler, 40 | } 41 | 42 | 43 | def config(scheduler: str, workspace: Optional[str] = None) -> Config: 44 | """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``.""" 45 | return Config(scheduler=scheduler, workspace=workspace) 46 | 47 | 48 | def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]: 49 | """Used to attach a formatter to the dryrun info when running 50 | :py:function:`~monarch.tools.commands.create` in ``dryrun`` mode so that 51 | the returned ``AppDryrunInfo`` can be printed to console. 52 | """ 53 | # no-op, use the default formatter already attached to the dryrun info 54 | return dryrun_info._fmt 55 | -------------------------------------------------------------------------------- /python/monarch/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/worker/lines.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from contextlib import contextmanager 9 | from typing import Any, List 10 | 11 | 12 | class Lines: 13 | """ 14 | Simple way to emit code where we track a per-line context object. 
15 | """ 16 | 17 | def __init__(self, context=None): 18 | self._lines: List[str] = [] 19 | self._context: List[Any] = [] 20 | self._current_context = context 21 | 22 | def get_context(self, lineno) -> Any: 23 | return self._context[lineno - 1] 24 | 25 | @contextmanager 26 | def context(self, obj: Any): 27 | old, self._current_context = self._current_context, obj 28 | try: 29 | yield 30 | finally: 31 | self._current_context = old 32 | 33 | def emit(self, lines: str) -> None: 34 | self._lines.extend(lines.split("\n")) 35 | while len(self._context) < len(self._lines): 36 | self._context.append(self._current_context) 37 | 38 | def emit_lines(self, lines: "Lines") -> None: 39 | """ 40 | Append another lines object on this one, 41 | preserving its per-line context. 42 | """ 43 | self._lines.extend(lines._lines) 44 | self._context.extend(lines._context) 45 | 46 | def text(self) -> str: 47 | return "\n".join(self._lines) 48 | -------------------------------------------------------------------------------- /python/monarch/worker/monitor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | import math 10 | import queue 11 | import threading 12 | from typing import Callable, Optional, Tuple 13 | 14 | from monarch_supervisor import TTL 15 | 16 | 17 | class Monitor: 18 | """A monitor is a thread that watches for reported events to expire.""" 19 | 20 | def __init__(self) -> None: 21 | self.thread = threading.Thread(target=self._main, daemon=True, name="monitor") 22 | self.events: queue.Queue[Tuple[Callable[[], None], Callable[[], float]]] = ( 23 | queue.Queue() 24 | ) 25 | self.events.put((lambda: None, TTL(None))) 26 | 27 | def start(self) -> None: 28 | """Start the monitor thread.""" 29 | self.thread.start() 30 | 31 | def _main(self) -> None: 32 | debug, ttl = self.events.get() 33 | while True: 34 | try: 35 | timeout = ttl() 36 | next_debug, next_ttl = self.events.get( 37 | timeout=None if timeout == math.inf else timeout 38 | ) 39 | except queue.Empty: 40 | debug() 41 | next_debug, next_ttl = self.events.get(timeout=None) 42 | 43 | debug, ttl = next_debug, next_ttl 44 | 45 | def __call__( 46 | self, 47 | debug_fn: Callable[[], None] = lambda: None, 48 | timeout: Optional[float] = None, 49 | ) -> None: 50 | """Start a new event with the provided timeout. 51 | If a timeout is specified, and a new event is not reported by before it expires, 52 | the provided debug_fn is called.""" 53 | self.events.put((debug_fn, TTL(timeout))) 54 | -------------------------------------------------------------------------------- /python/monarch/world_mesh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | 9 | from typing import List 10 | 11 | from monarch.common.client import Client 12 | 13 | from monarch.common.device_mesh import DeviceMesh 14 | from monarch.common.shape import NDSlice 15 | 16 | from monarch.controller.backend import ProcessBackend 17 | 18 | from monarch.controller.controller import Controller 19 | from monarch_supervisor import Context, Host 20 | 21 | 22 | def world_mesh( 23 | ctx: Context, 24 | hosts: List[Host], 25 | gpu_per_host: int, 26 | _processes=None, 27 | ) -> DeviceMesh: 28 | backend = ProcessBackend(ctx, hosts, gpu_per_host, _processes=_processes) 29 | client = Client(Controller(backend), backend.world_size, backend.gpu_per_host) 30 | return DeviceMesh( 31 | client, 32 | NDSlice(offset=0, sizes=[len(hosts), gpu_per_host], strides=[gpu_per_host, 1]), 33 | ("host", "gpu"), 34 | ) 35 | -------------------------------------------------------------------------------- /python/monarch_supervisor/_testing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import NamedTuple 9 | 10 | from monarch_supervisor import get_message_queue 11 | 12 | 13 | class Reply(NamedTuple): 14 | a: int 15 | b: int 16 | x: int 17 | 18 | 19 | def reply_hello(a, b, x): 20 | q = get_message_queue() 21 | q.send(Reply(a, b, x)) 22 | 23 | 24 | def echo(): 25 | q = get_message_queue() 26 | i = 0 27 | while True: 28 | sender, m = q.recv() 29 | if m == "exit": 30 | break 31 | assert m == i 32 | q.send(m) 33 | i += 1 34 | 35 | 36 | class Mapper: 37 | def map(self, items): 38 | return sum(x * 2 for x in items) 39 | 40 | def reduce(self, items): 41 | return sum(items) 42 | 43 | def finish(self, result): 44 | return result 45 | -------------------------------------------------------------------------------- /python/monarch_supervisor/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch_supervisor/diagram.png -------------------------------------------------------------------------------- /python/monarch_supervisor/function_call.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import importlib.util 9 | import sys 10 | 11 | from monarch_supervisor import _FunctionCall, get_message_queue 12 | 13 | if __name__ == "__main__": 14 | q = get_message_queue() 15 | _, call = q.recv() 16 | assert isinstance(call, _FunctionCall) 17 | filename, *rest = call.target.split(":", 1) 18 | if not rest: 19 | modulename, funcname = filename.rsplit(".", 1) 20 | module = importlib.import_module(modulename) 21 | else: 22 | spec = importlib.util.spec_from_file_location("__entry__", filename) 23 | assert spec is not None and spec.loader is not None 24 | module = importlib.util.module_from_spec(spec) 25 | # pyre-ignore[16] 26 | spec.loader.exec_module(module) 27 | sys.modules["__entry__"] = module 28 | funcname = rest[0] 29 | func = getattr(module, funcname) 30 | func(*call.args, **call.kwargs) 31 | -------------------------------------------------------------------------------- /python/monarch_supervisor/log_pstree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import logging 9 | import subprocess 10 | import sys 11 | from typing import Optional 12 | 13 | from monarch_supervisor.logging import gethostname, initialize_logging 14 | 15 | pid: str 16 | logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | def extract_pss(pid: str) -> Optional[str]: 20 | try: 21 | with open(f"/proc/{pid}/smaps_rollup", "r") as f: 22 | for line in f.readlines(): 23 | if line.startswith("Pss:"): # Check if the line starts with 'Pss:' 24 | return " ".join(line.split()[1:3]) 25 | except Exception: 26 | pass 27 | return None 28 | 29 | 30 | def log_pstree_output(pid: int) -> None: 31 | pstree_output = subprocess.check_output(["pstree", "-Tap", str(pid)]).decode( 32 | "utf-8" 33 | ) 34 | lines = pstree_output.split("\n") 35 | logger.info("Process Info") 36 | for line in lines: 37 | if not line.strip(): 38 | continue 39 | parts = line.split(",") 40 | pids = parts[1].split()[0] 41 | mem = extract_pss(pids) 42 | logger.info(f"{line} {mem}") 43 | 44 | 45 | if __name__ == "__main__": 46 | (pid,) = sys.argv[1:] 47 | initialize_logging(f"{gethostname()} host-manager") 48 | log_pstree_output(int(pid)) 49 | -------------------------------------------------------------------------------- /python/monarch_supervisor/python_executable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import importlib.resources 8 | import os 9 | import sys 10 | 11 | try: 12 | from __manifest__ import fbmake # noqa 13 | 14 | IN_PAR = True 15 | except ImportError: 16 | IN_PAR = False 17 | 18 | PYTHON_EXECUTABLE: str 19 | if IN_PAR: 20 | # The worker bootstrap binary will import this supervisor lib. When that 21 | # happens don't try to search for the bootstrap binary again, just use the 22 | # current executable. 
23 | import __main__ as main_module # @manual 24 | 25 | if hasattr(main_module, "__MONARCH_TENSOR_WORKER_ENV__"): 26 | PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"] 27 | else: 28 | try: 29 | with importlib.resources.path( 30 | "monarch_tensor_worker_env", "worker_env" 31 | ) as path: 32 | if not path.exists(): 33 | raise ImportError() 34 | PYTHON_EXECUTABLE = str(path) 35 | except ImportError: 36 | raise ImportError( 37 | "Monarch worker env not found, please define a custom 'monarch_tensor_worker_env' or " 38 | "add '//monarch/python/monarch_supervisor/worker:default_worker_env' " 39 | "to your binary dependencies in TARGETS" 40 | ) 41 | else: 42 | PYTHON_EXECUTABLE = sys.executable 43 | -------------------------------------------------------------------------------- /python/monarch_supervisor/worker/worker_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import runpy 9 | import sys 10 | 11 | __MONARCH_TENSOR_WORKER_ENV__ = True 12 | 13 | 14 | def main() -> None: 15 | assert sys.argv[1] == "-m" 16 | main_module = sys.argv[2] 17 | 18 | # Remove the -m and the main module from the command line arguments before 19 | # forwarding 20 | sys.argv[1:] = sys.argv[3:] 21 | # pyre-fixme[16]: Module `runpy` has no attribute `_run_module_as_main`. 22 | runpy._run_module_as_main(main_module, alter_argv=False) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/tests/__init__.py -------------------------------------------------------------------------------- /python/tests/_monarch/test_client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | 9 | from unittest import TestCase 10 | 11 | import torch 12 | from monarch._rust_bindings.monarch_extension import client 13 | 14 | from monarch._rust_bindings.monarch_hyperactor.proc import ActorId 15 | from pyre_extensions import none_throws 16 | 17 | 18 | class TestClient(TestCase): 19 | def test_simple_with_error_response(self) -> None: 20 | err = client.Error.new_for_unit_test( 21 | 7, 22 | 8, 23 | ActorId(world_name="test", rank=0, actor_name="actor"), 24 | "test error", 25 | ) 26 | resp = client.WorkerResponse.new_for_unit_test( 27 | seq=10, 28 | response=err, 29 | ) 30 | self.assertTrue(resp.is_exception()) 31 | exc = none_throws(resp.exception()) 32 | assert isinstance(exc, client.Error) 33 | 34 | self.assertEqual(exc.backtrace, "test error") 35 | self.assertEqual(resp.result(), None) 36 | self.assertEqual(resp.seq, 10) 37 | 38 | def test_simple_with_result_response(self) -> None: 39 | resp = client.WorkerResponse.new_for_unit_test( 40 | seq=11, 41 | response={"test": 1}, 42 | ) 43 | self.assertFalse(resp.is_exception()) 44 | self.assertEqual(resp.exception(), None) 45 | self.assertEqual(resp.result(), {"test": 1}) 46 | self.assertEqual(resp.seq, 11) 47 | 48 | def test_tensor(self) -> None: 49 | tensor = torch.rand(3) 50 | resp = client.WorkerResponse.new_for_unit_test( 51 | seq=11, 52 | response={"result": tensor}, 53 | ) 54 | self.assertTrue(torch.equal(resp.result()["result"], tensor)) 55 | -------------------------------------------------------------------------------- /python/tests/builtins/test_log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import logging 9 | from unittest.mock import patch 10 | 11 | import pytest 12 | 13 | from monarch._testing import BackendType, TestingContext 14 | from monarch.builtins.log import log_remote, set_logging_level_remote 15 | 16 | 17 | @pytest.fixture(scope="module", autouse=True) 18 | def testing_context(): 19 | global local 20 | with TestingContext() as local: 21 | yield 22 | 23 | 24 | @pytest.mark.timeout(120) 25 | @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS]) 26 | class TestLogFunctions: 27 | @classmethod 28 | def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True): 29 | return local.local_device_mesh( 30 | num_hosts, 31 | gpu_per_host, 32 | activate, 33 | rust=backend_type == BackendType.RS, 34 | ) 35 | 36 | @patch("monarch.builtins.log.logger") 37 | def test_log_remote_default_level(self, mock_log, backend_type): 38 | with self.local_device_mesh(1, 1, backend_type): 39 | log_remote("test warning message") 40 | 41 | @patch("monarch.builtins.log.logger") 42 | def test_log_remote_with_args(self, mock_log, backend_type): 43 | with self.local_device_mesh(1, 1, backend_type): 44 | log_remote("test message with %s and %d", "str", 42) 45 | 46 | @patch("monarch.builtins.log.logger") 47 | def test_set_logging_level_remote(self, mock_logger, backend_type): 48 | with self.local_device_mesh(1, 1, backend_type): 49 | set_logging_level_remote(logging.DEBUG) 50 | 51 | @patch("monarch.builtins.log.logger") 52 | def test_log_remote_custom_level(self, mock_log, backend_type): 53 | with self.local_device_mesh(1, 1, backend_type): 54 | set_logging_level_remote(logging.ERROR) 55 | log_remote("ignored info message", level=logging.INFO) 56 | log_remote("seen error message", level=logging.ERROR) 57 | 58 | @patch("monarch.builtins.log.logger") 59 | def test_log_remote_multiple_calls(self, mock_log, backend_type): 60 | with self.local_device_mesh(1, 1, backend_type): 61 | log_remote("First message") 62 | log_remote("Second message", level=logging.INFO) 63 | log_remote("Third message", level=logging.ERROR) 64 | -------------------------------------------------------------------------------- /python/tests/dispatch_bench_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import torch 9 | 10 | from monarch.common.remote import remote 11 | 12 | 13 | def run_loop_local(n_iters, tensor_shape=(2, 2)): 14 | local = torch.zeros(*tensor_shape) 15 | ones = torch.ones(*tensor_shape) 16 | for _ in range(n_iters): 17 | local = ones + local 18 | return local 19 | 20 | 21 | def _run_loop(*args, **kwargs): 22 | return torch.ones(args[1]) 23 | 24 | 25 | run_loop = remote("tests.dispatch_bench_helper.run_loop_local", propagate=_run_loop) 26 | -------------------------------------------------------------------------------- /python/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-timeout 3 | pytest-asyncio 4 | -------------------------------------------------------------------------------- /python/tests/simulator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/tests/simulator/__init__.py -------------------------------------------------------------------------------- /python/tests/simulator/test_task.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import unittest 9 | 10 | from monarch.simulator.task import Task, TaskState, WorkerTaskManager 11 | 12 | 13 | class TestTask(unittest.TestCase): 14 | def test_worker_task_manager(self): 15 | manager = WorkerTaskManager() 16 | kwargs = { 17 | "inputs": [2], 18 | "outputs": [3], 19 | "command_id": 1, 20 | "start_time": 9, 21 | "runtime": 1, 22 | "meta": ["a"], 23 | } 24 | task = Task(**kwargs) 25 | task._state = TaskState.EXECUTED 26 | 27 | manager.add(task) 28 | # This task is executed. 29 | manager.remove(task) 30 | 31 | task2 = Task(**kwargs) 32 | task2.dependencies = [task] 33 | manager.add(task2) 34 | 35 | collectives = [] 36 | collective_task = Task(collectives=collectives, **kwargs) 37 | collective_task.dependencies = [task2] 38 | manager.add(collective_task) 39 | # This is from another worker. Don't add it to the manager. 
40 | other_worker_task = Task(**kwargs) 41 | 42 | collectives.append(other_worker_task) 43 | wait_task = Task(waits=[task], **kwargs) 44 | manager.add(wait_task) 45 | 46 | cloned_manager = manager.clone() 47 | 48 | self.assertEqual(len(manager.tasks), 3) 49 | self.assertEqual(manager.tasks.keys(), cloned_manager.tasks.keys()) 50 | cloned_task2 = cloned_manager.tasks[task2.task_id] 51 | self.assertNotEqual(task2, cloned_task2) 52 | for k in kwargs.keys(): 53 | self.assertEqual(getattr(cloned_task2, k), getattr(task2, k)) 54 | self.assertEqual(cloned_task2.dependencies[0].task_id, task.task_id) 55 | self.assertNotEqual(cloned_task2.dependencies[0], task) 56 | cloned_wait_task = cloned_manager.tasks[wait_task.task_id] 57 | self.assertEqual(cloned_wait_task.waits[0].task_id, task.task_id) 58 | self.assertNotEqual(cloned_wait_task.waits[0], task) 59 | 60 | self.assertEqual(len(collectives), 3) 61 | cloned_collective_task = cloned_manager.tasks[collective_task.task_id] 62 | self.assertTrue(collective_task in collectives) 63 | self.assertTrue(cloned_collective_task in collectives) 64 | self.assertNotEqual(collective_task, cloned_collective_task) 65 | -------------------------------------------------------------------------------- /python/tests/sleep_binary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | # pyre-strict 9 | 10 | """ 11 | A simple binary that calls the sleep_indefinitely_for_unit_tests function from the monarch extension. 12 | This is used to test the signal handling behavior of signal_safe_block_on. 13 | """ 14 | 15 | import sys 16 | 17 | from monarch._rust_bindings.monarch_hyperactor.runtime import ( # @manual 18 | sleep_indefinitely_for_unit_tests, 19 | ) 20 | 21 | 22 | def main() -> None: 23 | print("Starting sleep_binary. Process will sleep indefinitely until interrupted.") 24 | sys.stdout.flush() # Ensure the message is printed before we sleep 25 | 26 | try: 27 | # This will sleep indefinitely until interrupted by a signal 28 | sleep_indefinitely_for_unit_tests() 29 | except KeyboardInterrupt: 30 | print("Received KeyboardInterrupt, exiting.") 31 | sys.exit(0) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /python/tests/test_alloc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | 9 | from unittest import IsolatedAsyncioTestCase 10 | 11 | from monarch import ProcessAllocator 12 | from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 13 | AllocConstraints, 14 | AllocSpec, 15 | ) 16 | 17 | 18 | class TestAlloc(IsolatedAsyncioTestCase): 19 | async def test_basic(self) -> None: 20 | cmd = "echo hello" 21 | allocator = ProcessAllocator(cmd) 22 | spec = AllocSpec(AllocConstraints(), replica=2) 23 | alloc = await allocator.allocate(spec) 24 | 25 | print(alloc) 26 | -------------------------------------------------------------------------------- /python/tests/test_sim_backend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | 9 | from contextlib import contextmanager 10 | from typing import Generator, Optional 11 | from unittest import TestCase 12 | 13 | import pytest 14 | 15 | import torch 16 | from monarch import fetch_shard 17 | from monarch.common.device_mesh import DeviceMesh 18 | from monarch.sim_mesh import sim_mesh 19 | 20 | 21 | @contextmanager 22 | def local_sim_mesh( 23 | hosts: int = 1, 24 | # TODO: support multiple gpus in a mesh. 25 | gpu_per_host: int = 1, 26 | activate: bool = True, 27 | proxy_addr: Optional[str] = None, 28 | ) -> Generator[DeviceMesh, None, None]: 29 | dms = sim_mesh( 30 | n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host, proxy_addr=proxy_addr 31 | ) 32 | dm = dms[0] 33 | try: 34 | if activate: 35 | with dm.activate(): 36 | yield dm 37 | else: 38 | yield dm 39 | dm.exit() 40 | except Exception: 41 | dm.client._shutdown = True 42 | raise 43 | 44 | 45 | # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited 46 | @pytest.mark.oss_skip 47 | class TestSimBackend(TestCase): 48 | def test_local_mesh_setup(self): 49 | with local_sim_mesh(): 50 | t = torch.zeros(3, 4) 51 | t.add_(1) 52 | local_t = fetch_shard(t).result() 53 | # consider support specifying the return value in the mock worker. 54 | assert local_t is not None 55 | -------------------------------------------------------------------------------- /python/tests/tools/config/test_defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | import unittest 9 | from pathlib import Path 10 | 11 | from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults 12 | defaults, 13 | ) 14 | from torchx.specs.builders import _create_args_parser 15 | 16 | 17 | class TestDefaults(unittest.TestCase): 18 | def test_default_config(self) -> None: 19 | for scheduler in defaults.scheduler_factories(): 20 | with self.subTest(scheduler=scheduler): 21 | config = defaults.config(scheduler) 22 | 23 | # make sure that we've set the scheduler name when returning the config 24 | self.assertEqual(scheduler, config.scheduler) 25 | 26 | # make sure a new Config is returned each time 27 | # by modifying the returned config 28 | # -> re-getting the default configs for the same scheduler 29 | # -> validating the changes are not persisted in the new config 30 | self.assertNotIn("foo", config.scheduler_args) 31 | config.scheduler_args["foo"] = "bar" 32 | self.assertNotIn("foo", defaults.config(scheduler).scheduler_args) 33 | 34 | def test_default_config_workspace(self) -> None: 35 | current_working_dir = str(Path.cwd()) 36 | config = defaults.config("local_cwd", current_working_dir) 37 | self.assertEqual(current_working_dir, config.workspace) 38 | 39 | def test_default_scheduler_factories(self) -> None: 40 | # just make sure the common schedulers are present 41 | self.assertIn("local_cwd", defaults.scheduler_factories()) 42 | self.assertIn("slurm", defaults.scheduler_factories()) 43 | 44 | def test_default_component(self) -> None: 45 | # just make sure there exists a default component for each configured scheduler 46 | # and that the returned default component is a valid component 47 | for scheduler in defaults.scheduler_factories(): 48 | with self.subTest(scheduler=scheduler): 49 | component_fn = defaults.component_fn(scheduler) 50 | 51 | # the following will fail if the component_fn is not a valid torchx component 52 | with self.assertRaises(SystemExit): 53 | _create_args_parser(component_fn).parse_args(["--help"]) 54 | -------------------------------------------------------------------------------- /python/tests/tools/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | import contextlib 9 | import io 10 | from typing import Generator 11 | 12 | 13 | @contextlib.contextmanager 14 | def capture_stdout() -> Generator[io.StringIO, None, None]: 15 | with io.StringIO() as buf, contextlib.redirect_stdout(buf): 16 | yield buf 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pyzmq 3 | requests 4 | numpy 5 | pyre-extensions 6 | cloudpickle 7 | torchx-nightly 8 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | # @rustc_version: rustc 1.87.0-nightly (920d95eaf 2025-03-28) 2 | [toolchain] 3 | channel = "nightly-2025-03-29" 4 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Get help on options with `rustfmt --help=config` 2 | # Please keep these in alphabetical order. 3 | edition = "2021" 4 | format_code_in_doc_comments = true 5 | group_imports = "StdExternalCrate" 6 | imports_granularity = "Item" 7 | merge_derives = false 8 | style_edition = "2024" 9 | use_field_init_shorthand = true 10 | -------------------------------------------------------------------------------- /timed_test/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/timed_test:[timed_test,timed_test_test] 2 | 3 | [package] 4 | name = "timed_test" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [[test]] 16 | name = "timed_test_test" 17 | path = "tests/basic.rs" 18 | 19 | [dependencies] 20 | quote = "1.0.29" 21 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 22 | 23 | [dev-dependencies] 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | -------------------------------------------------------------------------------- /timed_test/tests/basic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use timed_test::async_timed_test; 10 | 11 | #[async_timed_test(timeout_secs = 5)] 12 | async fn good() { 13 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 14 | } 15 | 16 | #[async_timed_test(timeout_secs = 1)] 17 | #[should_panic] 18 | async fn bad() { 19 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 20 | } 21 | -------------------------------------------------------------------------------- /tools/rust/ossconfigs/clippy.toml: -------------------------------------------------------------------------------- 1 | disallowed-methods = [ 2 | { path = "tokio::time::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 3 | { path = "std::thread::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 4 | { path = "tokio::time::Instant::now", reason = "use `hyperactor::clock::Clock::now` instead." 
}, 5 | { path = "std::time::SystemTime::now", reason = "use `hyperactor::clock::Clock::system_time_now` instead." }, 6 | ] 7 | -------------------------------------------------------------------------------- /torch-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/torch-sys:torch-sys 2 | 3 | [package] 4 | name = "torch-sys" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | links = "torch" 10 | 11 | [dependencies] 12 | anyhow = "1.0.95" 13 | async-trait = "0.1.86" 14 | atomic_refcell = "0.1.13" 15 | cxx = "1.0.119" 16 | derive_more = { version = "1.0.0", features = ["full"] } 17 | fxhash = "0.2.1" 18 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 19 | nccl-sys = { path = "../nccl-sys" } 20 | paste = "1.0.14" 21 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 22 | regex = "1.11.1" 23 | serde = { version = "1.0.185", features = ["derive", "rc"] } 24 | thiserror = "2.0.12" 25 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 26 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 27 | 28 | [dev-dependencies] 29 | bincode = "1.3.3" 30 | 31 | [build-dependencies] 32 | bindgen = "0.70.1" 33 | cxx-build = "1.0.119" 34 | -------------------------------------------------------------------------------- /torch-sys/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | See the source documentation or `bunnylol rustdoc torch-sys` to see docs. 4 | 5 | # Cargo build 6 | 7 | The cargo build requires that you have a version of PyTorch installed in your 8 | Python environment. To get set up, run the following on your devgpu: 9 | 10 | ```sh 11 | # get conda on devserver 12 | sudo feature install genai_conda 13 | 14 | # Set up conda env 15 | conda create -n monarch 16 | conda activate monarch 17 | 18 | # install pytorch 19 | conda install pytorch pytorch-cuda=12.4 -c pytorch -c nvidia 20 | 21 | # install cuda toolkit on devserver (requires devgpu) 22 | sudo dnf install cuda-12-0 23 | 24 | # install nccl on devserver (requires devgpu) 25 | sudo dnf install libnccl-devel 26 | 27 | # install libclang on devserver (needed for rust-bindgen) 28 | sudo dnf install clang-devel 29 | 30 | # in monarch/torch-sys 31 | cargo test 32 | ``` 33 | -------------------------------------------------------------------------------- /torch-sys/src/bindings.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 10 | -------------------------------------------------------------------------------- /torch-sys/src/pyobject.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use cxx::type_id; 10 | use monarch_types::TryIntoPyObject; 11 | use pyo3::prelude::*; 12 | 13 | #[repr(transparent)] 14 | pub(crate) struct FFIPyObject(*mut pyo3::ffi::PyObject); 15 | 16 | // SAFETY: This is just a pointer to a PyObject and the pointer is 17 | // never dereferenced directly. It can only be converted to pyo3::PyObject 18 | // and then dereferenced through that. PyO3 manages the access patterns to 19 | // the underlying PyObject. 20 | // Additionally, we make the assumption that ownership of the underlying 21 | // PyObject is transferred with the it. 22 | // Hence FFIPyObject should always be created from an owned pointer. 23 | unsafe impl cxx::ExternType for FFIPyObject { 24 | type Id = type_id!("monarch::FFIPyObject"); 25 | type Kind = cxx::kind::Trivial; 26 | } 27 | 28 | impl From> for FFIPyObject { 29 | #[inline] 30 | fn from(obj: Py) -> Self { 31 | Self(obj.into_ptr()) 32 | } 33 | } 34 | 35 | impl From> for FFIPyObject { 36 | #[inline] 37 | fn from(obj: Bound<'_, T>) -> Self { 38 | Self(obj.into_ptr()) 39 | } 40 | } 41 | 42 | impl From<&Bound<'_, T>> for FFIPyObject { 43 | #[inline] 44 | fn from(obj: &Bound<'_, T>) -> Self { 45 | Self(obj.clone().into_ptr()) 46 | } 47 | } 48 | 49 | impl IntoPy for FFIPyObject { 50 | #[inline] 51 | fn into_py(self, py: Python<'_>) -> PyObject { 52 | // SAFETY: Pull in the `PyObject` from C/C++. 53 | unsafe { PyObject::from_owned_ptr(py, self.0) } 54 | } 55 | } 56 | 57 | impl TryIntoPyObject for FFIPyObject { 58 | #[inline] 59 | fn try_to_object<'a>(self, py: Python<'a>) -> PyResult> { 60 | // SAFETY: Pull in the `PyObject` from C/C++. 61 | Ok(unsafe { PyObject::from_owned_ptr(py, self.0) }.into_bound(py)) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /torch-sys/src/torch.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | // Going for a smaller set of headers until more enums are needed 12 | #include 13 | #include 14 | #include 15 | -------------------------------------------------------------------------------- /torch-sys/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | class Custom: 9 | pass 10 | --------------------------------------------------------------------------------