├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── build-cpu.yml │ ├── build-cuda.yml │ ├── ci.yml │ ├── test-cpu.yml │ ├── test-cuda.yml │ └── wheels.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── books └── hyperactor-book │ ├── .gitignore │ ├── README.md │ ├── book.toml │ └── src │ ├── SUMMARY.md │ ├── actors │ ├── actor.md │ ├── actor_handle.md │ ├── actor_lifecycle.md │ ├── binds.md │ ├── checkpointable.md │ ├── handler.md │ ├── index.md │ ├── remotable_actor.md │ ├── remote_actor.md │ └── remote_handles.md │ ├── introduction.md │ ├── macros │ ├── export.md │ ├── forward.md │ ├── handle_client.md │ ├── handler.md │ ├── index.md │ ├── named.md │ └── ref_client.md │ ├── mailboxes │ ├── delivery.md │ ├── index.md │ ├── mailbox.md │ ├── mailbox_client.md │ ├── mailbox_sender.md │ ├── mailbox_server.md │ ├── multiplexer.md │ ├── ports.md │ ├── reconfigurable_sender.md │ └── routers.md │ └── references │ ├── actor_id.md │ ├── bindings.md │ ├── gang_id.md │ ├── gangs.md │ ├── index.md │ ├── port_id.md │ ├── proc_id.md │ ├── reference.md │ ├── syntax.md │ ├── typed_refs.md │ └── world_id.md ├── build-requirements.txt ├── clippy.toml ├── controller ├── Cargo.toml ├── build.rs └── src │ ├── bootstrap.rs │ ├── history.rs │ ├── lib.rs │ └── main.rs ├── cuda-sys ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── wrapper.h ├── examples ├── __init__.py ├── grpo_actor.py └── notebooks │ ├── README.md │ ├── ping_pong.ipynb │ └── spmd_ddp.ipynb ├── hyper ├── Cargo.toml ├── src │ ├── commands.rs │ ├── commands │ │ ├── demo.rs │ │ ├── procs.rs │ │ ├── serve.rs │ │ ├── show.rs │ │ └── top.rs │ ├── lib.rs │ ├── main.rs │ ├── tui │ │ ├── mod.rs │ │ └── top.rs │ └── utils │ │ ├── mod.rs │ │ └── system_address.rs └── tests │ └── demo_test.py ├── hyperactor ├── Cargo.toml ├── example │ └── derive.rs └── src │ ├── accum.rs │ ├── actor.rs │ ├── actor │ └── remote.rs │ ├── attrs.rs │ ├── cap.rs │ ├── 
channel.rs │ ├── channel │ ├── local.rs │ ├── net.rs │ └── sim.rs │ ├── checkpoint.rs │ ├── clock.rs │ ├── config.rs │ ├── data.rs │ ├── init.rs │ ├── lib.rs │ ├── mailbox.rs │ ├── mailbox │ ├── durable_mailbox_sender.rs │ ├── mailbox_admin_message.rs │ └── undeliverable.rs │ ├── message.rs │ ├── metrics.rs │ ├── panic_handler.rs │ ├── parse.rs │ ├── proc.rs │ ├── reference.rs │ ├── simnet.rs │ ├── spawn.rs │ ├── supervision.rs │ ├── sync.rs │ ├── sync │ ├── flag.rs │ └── monitor.rs │ ├── test_utils.rs │ └── test_utils │ ├── pingpong.rs │ ├── proc_supervison.rs │ └── process_assertion.rs ├── hyperactor_macros ├── Cargo.toml ├── build.rs ├── src │ └── lib.rs └── tests │ ├── basic.rs │ ├── castable.rs │ └── export.rs ├── hyperactor_mesh ├── Cargo.toml ├── examples │ ├── dining_philosophers.rs │ └── sieve.rs ├── src │ ├── actor_mesh.rs │ ├── alloc.rs │ ├── alloc │ │ ├── local.rs │ │ ├── logtailer.rs │ │ ├── process.rs │ │ ├── remoteprocess.rs │ │ └── sim.rs │ ├── assign.rs │ ├── bootstrap.rs │ ├── comm.rs │ ├── comm │ │ └── multicast.rs │ ├── connect.rs │ ├── lib.rs │ ├── logging.rs │ ├── mesh.rs │ ├── mesh_selection.rs │ ├── metrics.rs │ ├── proc_mesh.rs │ ├── proc_mesh │ │ └── mesh_agent.rs │ ├── reference.rs │ ├── shared_cell.rs │ ├── shortuuid.rs │ └── test_utils.rs └── test │ ├── bootstrap.rs │ └── process_allocator_cleanup │ ├── process_allocator_cleanup.rs │ ├── process_allocator_test_bin.rs │ └── process_allocator_test_bootstrap.rs ├── hyperactor_mesh_macros ├── Cargo.toml └── src │ └── lib.rs ├── hyperactor_multiprocess ├── Cargo.toml └── src │ ├── lib.rs │ ├── ping_pong.rs │ ├── proc_actor.rs │ ├── pyspy.rs │ ├── scheduler.rs │ ├── supervision.rs │ ├── system.rs │ └── system_actor.rs ├── hyperactor_telemetry ├── Cargo.toml ├── src │ ├── lib.rs │ ├── otel.rs │ ├── pool.rs │ ├── recorder.rs │ └── spool.rs ├── stubs │ ├── fbinit │ │ └── src │ │ │ └── lib.rs │ └── scuba │ │ └── src │ │ └── lib.rs └── tester │ ├── Cargo.toml │ └── main.rs ├── monarch_extension 
├── Cargo.toml ├── build.rs └── src │ ├── blocking.rs │ ├── client.rs │ ├── code_sync.rs │ ├── controller.rs │ ├── convert.rs │ ├── debugger.rs │ ├── lib.rs │ ├── logging.rs │ ├── mesh_controller.rs │ ├── panic.rs │ ├── simulation_tools.rs │ ├── simulator_client.rs │ └── tensor_worker.rs ├── monarch_hyperactor ├── Cargo.toml └── src │ ├── actor.rs │ ├── actor_mesh.rs │ ├── alloc.rs │ ├── bin │ └── process_allocator │ │ ├── common.rs │ │ └── main.rs │ ├── bootstrap.rs │ ├── channel.rs │ ├── code_sync.rs │ ├── code_sync │ ├── manager.rs │ ├── rsync.rs │ └── workspace.rs │ ├── config.rs │ ├── lib.rs │ ├── local_state_broker.rs │ ├── mailbox.rs │ ├── ndslice.rs │ ├── proc.rs │ ├── proc_mesh.rs │ ├── runtime.rs │ ├── selection.rs │ ├── shape.rs │ ├── supervision.rs │ └── telemetry.rs ├── monarch_messages ├── Cargo.toml ├── build.rs ├── src │ ├── client.rs │ ├── controller.rs │ ├── debugger.rs │ ├── lib.rs │ ├── wire_value.rs │ └── worker.rs └── test_utils.py ├── monarch_rdma ├── Cargo.toml ├── examples │ ├── Cargo.toml │ ├── bootstrap.rs │ ├── main.rs │ └── parameter_server.rs ├── extension │ ├── Cargo.toml │ └── lib.rs └── src │ ├── ibverbs_primitives.rs │ ├── lib.rs │ ├── macros.rs │ ├── rdma_components.rs │ ├── rdma_manager_actor.rs │ └── test_utils.rs ├── monarch_simulator ├── Cargo.toml └── src │ ├── bootstrap.rs │ ├── collective_coordinator.rs │ ├── controller.rs │ ├── lib.rs │ ├── simulator.rs │ └── worker.rs ├── monarch_tensor_worker ├── Cargo.toml ├── build.rs ├── src │ ├── bootstrap.rs │ ├── borrow.rs │ ├── comm.rs │ ├── device_mesh.rs │ ├── lib.rs │ ├── pipe.rs │ ├── py_pipe.rs │ ├── stream.rs │ └── test_util.rs ├── test_utils.py └── test_worker_main.py ├── monarch_types ├── Cargo.toml └── src │ ├── lib.rs │ ├── pyobject.rs │ ├── python.rs │ └── pytree.rs ├── nccl-sys ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── nccl.h ├── ndslice ├── Cargo.toml └── src │ ├── lib.rs │ ├── reshape.rs │ ├── selection.rs │ ├── selection │ ├── normal.rs │ ├── parse.rs │ 
├── pretty.rs │ ├── routing.rs │ ├── test_utils.rs │ └── token_parser.rs │ ├── shape.rs │ ├── slice.rs │ ├── strategy.rs │ └── utils.rs ├── preempt_rwlock ├── Cargo.toml └── src │ └── lib.rs ├── pyproject.toml ├── python ├── monarch │ ├── __init__.py │ ├── _rust_bindings │ │ ├── __init__.pyi │ │ ├── controller │ │ │ └── bootstrap.pyi │ │ ├── monarch_extension │ │ │ ├── __init__.pyi │ │ │ ├── blocking.pyi │ │ │ ├── client.pyi │ │ │ ├── code_sync.pyi │ │ │ ├── controller.pyi │ │ │ ├── debugger.pyi │ │ │ ├── logging.pyi │ │ │ ├── mesh_controller.pyi │ │ │ ├── panic.pyi │ │ │ ├── simulation_tools.pyi │ │ │ ├── simulator_client.pyi │ │ │ └── tensor_worker.pyi │ │ ├── monarch_hyperactor │ │ │ ├── actor.pyi │ │ │ ├── actor_mesh.pyi │ │ │ ├── alloc.pyi │ │ │ ├── bootstrap.pyi │ │ │ ├── channel.pyi │ │ │ ├── mailbox.pyi │ │ │ ├── proc.pyi │ │ │ ├── proc_mesh.pyi │ │ │ ├── runtime.pyi │ │ │ ├── selection.pyi │ │ │ ├── shape.pyi │ │ │ ├── supervision.pyi │ │ │ └── telemetry.pyi │ │ ├── monarch_messages │ │ │ └── debugger.pyi │ │ ├── monarch_tensor_worker │ │ │ └── bootstrap.pyi │ │ ├── old.pyi │ │ └── rdma │ │ │ └── __init__.pyi │ ├── _src │ │ ├── __init__.py │ │ ├── actor │ │ │ ├── __init__.py │ │ │ ├── actor_mesh.py │ │ │ ├── allocator.py │ │ │ ├── bootstrap_main.py │ │ │ ├── code_sync │ │ │ │ ├── __init__.py │ │ │ │ └── auto_reload.py │ │ │ ├── debugger.py │ │ │ ├── device_utils.py │ │ │ ├── future.py │ │ │ ├── pdb_wrapper.py │ │ │ ├── pickle.py │ │ │ ├── proc_mesh.py │ │ │ ├── shape.py │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── rust_span_tracing.py │ │ └── tensor_engine │ │ │ ├── __init__.py │ │ │ └── rdma.py │ ├── _testing.py │ ├── actor │ │ └── __init__.py │ ├── actor_mesh.py │ ├── bootstrap_main.py │ ├── builtins │ │ ├── __init__.py │ │ ├── log.py │ │ └── random.py │ ├── cached_remote_function.py │ ├── common │ │ ├── _C.pyi │ │ ├── __init__.py │ │ ├── _coalescing.py │ │ ├── _tensor_to_table.py │ │ ├── base_tensor.py │ │ ├── borrows.py │ │ ├── client.py │ │ 
├── constants.py │ │ ├── context_manager.py │ │ ├── controller_api.py │ │ ├── device_mesh.py │ │ ├── fake.py │ │ ├── function.py │ │ ├── function_caching.py │ │ ├── future.py │ │ ├── init.cpp │ │ ├── invocation.py │ │ ├── mast.py │ │ ├── messages.py │ │ ├── mock_cuda.cpp │ │ ├── mock_cuda.h │ │ ├── mock_cuda.py │ │ ├── opaque_ref.py │ │ ├── pipe.py │ │ ├── process_group.py │ │ ├── recording.py │ │ ├── reference.py │ │ ├── remote.py │ │ ├── selection.py │ │ ├── stream.py │ │ ├── tensor.py │ │ ├── tensor_factory.py │ │ └── tree.py │ ├── controller │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── controller.py │ │ ├── debugger.py │ │ ├── history.py │ │ └── rust_backend │ │ │ ├── __init__.py │ │ │ └── controller.py │ ├── fetch.py │ ├── gradient │ │ ├── __init__.py │ │ ├── _gradient_generator.cpp │ │ └── _gradient_generator.pyi │ ├── gradient_generator.py │ ├── memory.py │ ├── mesh_controller.py │ ├── notebook.py │ ├── opaque_module.py │ ├── opaque_object.py │ ├── parallel │ │ ├── __init__.py │ │ └── pipelining │ │ │ ├── __init__.py │ │ │ ├── runtime.py │ │ │ ├── schedule_ir.py │ │ │ └── scheduler.py │ ├── proc_mesh.py │ ├── profiler.py │ ├── python_local_mesh.py │ ├── random.py │ ├── rdma.py │ ├── remote_class.py │ ├── rust_backend_mesh.py │ ├── rust_local_mesh.py │ ├── sim_mesh.py │ ├── simulator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── command_history.py │ │ ├── config.py │ │ ├── interface.py │ │ ├── ir.py │ │ ├── mock_controller.py │ │ ├── profiling.py │ │ ├── simulator.py │ │ ├── task.py │ │ ├── tensor.py │ │ ├── trace.py │ │ ├── utils.py │ │ └── worker.py │ ├── tensor_worker_main.py │ ├── tensorboard.py │ ├── timer │ │ ├── README.md │ │ ├── __init__.py │ │ ├── example_monarch.py │ │ ├── example_spmd.py │ │ ├── execution_timer.py │ │ └── execution_timer_test.py │ ├── tools │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── commands.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ └── hyperactor.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── defaults.py │ │ ├── 
mesh_spec.py │ │ ├── network.py │ │ └── utils.py │ ├── worker │ │ ├── __init__.py │ │ ├── _testing_function.py │ │ ├── compiled_block.py │ │ ├── debugger.py │ │ ├── lines.py │ │ ├── monitor.py │ │ └── worker.py │ └── world_mesh.py ├── monarch_supervisor │ ├── README.md │ ├── __init__.py │ ├── _testing.py │ ├── diagram.png │ ├── function_call.py │ ├── host.py │ ├── launchers.py │ ├── log_pstree.py │ ├── logging.py │ ├── python_executable.py │ └── worker │ │ └── worker_env.py └── tests │ ├── __init__.py │ ├── _monarch │ ├── test_actor.py │ ├── test_actor_mesh.py │ ├── test_client.py │ ├── test_controller.py │ ├── test_hyperactor.py │ ├── test_mailbox.py │ ├── test_ndslice.py │ └── test_worker.py │ ├── builtins │ ├── test_log.py │ └── test_random.py │ ├── code_sync │ └── test_auto_reload.py │ ├── dispatch_bench.py │ ├── dispatch_bench_helper.py │ ├── error_test_binary.py │ ├── requirements.txt │ ├── simulator │ ├── __init__.py │ ├── test_profiling.py │ ├── test_simulator.py │ ├── test_task.py │ └── test_worker.py │ ├── sleep_binary.py │ ├── test_actor_error.py │ ├── test_alloc.py │ ├── test_allocator.py │ ├── test_coalescing.py │ ├── test_controller.py │ ├── test_debugger.py │ ├── test_device_mesh.py │ ├── test_fault_tolerance.py │ ├── test_future.py │ ├── test_grad_generator.py │ ├── test_mock_cuda.py │ ├── test_pdb_actor.py │ ├── test_python_actors.py │ ├── test_remote_functions.py │ ├── test_rust_backend.py │ ├── test_signal_safe_block_on.py │ ├── test_sim_backend.py │ ├── test_tensor_engine.py │ └── tools │ ├── config │ └── test_defaults.py │ ├── test_cli.py │ ├── test_commands.py │ ├── test_mesh_spec.py │ ├── test_network.py │ ├── test_utils.py │ └── utils.py ├── rdmacore-sys ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── wrapper.h ├── requirements.txt ├── rust-toolchain ├── rustfmt.toml ├── scripts └── common-setup.sh ├── setup.py ├── timed_test ├── Cargo.toml ├── src │ └── lib.rs └── tests │ └── basic.rs ├── tools └── rust │ └── ossconfigs │ └── 
clippy.toml ├── torch-sys-cuda ├── Cargo.toml ├── build.rs └── src │ ├── bridge.cpp │ ├── bridge.h │ ├── bridge.rs │ ├── cuda.rs │ ├── lib.rs │ └── nccl.rs └── torch-sys ├── Cargo.toml ├── README.md ├── build.rs └── src ├── backend.rs ├── bindings.rs ├── borrow.rs ├── bridge.cpp ├── bridge.h ├── bridge.rs ├── call_op.rs ├── cell.rs ├── device.rs ├── ivalue.rs ├── layout.rs ├── lib.rs ├── memory_format.rs ├── pyobject.rs ├── rvalue.rs ├── scalar_type.rs ├── tensor.rs └── torch.hpp /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | .pyre 4 | docs 5 | *_meta/** 6 | **/*_meta/** 7 | **/*_meta.rs 8 | **/meta/** 9 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 256 3 | extend-ignore = E302, G004, SIM105, G201, SIM115, SIM904 4 | -------------------------------------------------------------------------------- /.github/workflows/build-cpu.yml: -------------------------------------------------------------------------------- 1 | name: Build CPU 2 | 3 | on: 4 | workflow_call: 5 | 6 | concurrency: 7 | group: build-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build-cpu: 12 | name: Build CPU - No Tensor Engine 13 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 14 | strategy: 15 | fail-fast: true 16 | matrix: 17 | include: 18 | - name: 4xlarge 19 | runs-on: linux.4xlarge 20 | with: 21 | timeout: 60 22 | runner: ${{ matrix.runs-on }} 23 | submodules: recursive 24 | upload-artifact: monarch-cpu-${{ github.sha }} 25 | script: | 26 | # Source common setup functions 27 | source scripts/common-setup.sh 28 | 29 | # Setup build environment (conda + system deps + rust + build deps) 30 | setup_build_environment 31 | 32 | # Build monarch (No 
tensor engine, CPU version) 33 | USE_TENSOR_ENGINE=0 python setup.py bdist_wheel 34 | -------------------------------------------------------------------------------- /.github/workflows/build-cuda.yml: -------------------------------------------------------------------------------- 1 | name: Build CUDA 2 | 3 | on: 4 | workflow_call: 5 | 6 | concurrency: 7 | group: build-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build-cuda: 12 | name: Build CUDA (cuda12.6-py3.10) 13 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 14 | strategy: 15 | fail-fast: true 16 | matrix: 17 | include: 18 | - name: 4xlargegpu 19 | runs-on: linux.g5.4xlarge.nvidia.gpu 20 | torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126' 21 | gpu-arch-type: "cuda" 22 | gpu-arch-version: "12.6" 23 | with: 24 | timeout: 60 25 | runner: ${{ matrix.runs-on }} 26 | gpu-arch-type: ${{ matrix.gpu-arch-type }} 27 | gpu-arch-version: ${{ matrix.gpu-arch-version }} 28 | submodules: recursive 29 | upload-artifact: monarch-cuda-${{ github.sha }} 30 | script: | 31 | # Source common setup functions 32 | source scripts/common-setup.sh 33 | 34 | # Setup build environment (conda + system deps + rust + build deps) 35 | setup_build_environment 36 | 37 | # Setup Tensor Engine 38 | setup_tensor_engine 39 | 40 | # Build the process allocator binary 41 | build_process_allocator 42 | 43 | # Build monarch (CUDA version) 44 | python setup.py bdist_wheel 45 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - gh/** 8 | push: 9 | branches: 10 | - main 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || 
github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | build-cuda: 18 | name: Build CUDA 19 | uses: ./.github/workflows/build-cuda.yml 20 | 21 | build-cpu: 22 | name: Build CPU 23 | uses: ./.github/workflows/build-cpu.yml 24 | 25 | test-cuda: 26 | name: Test CUDA 27 | needs: build-cuda 28 | uses: ./.github/workflows/test-cuda.yml 29 | with: 30 | artifact-name: monarch-cuda-${{ github.sha }} 31 | 32 | test-cpu: 33 | name: Test CPU 34 | needs: build-cpu 35 | uses: ./.github/workflows/test-cpu.yml 36 | with: 37 | artifact-name: monarch-cpu-${{ github.sha }} 38 | 39 | status-check: 40 | name: Status Check 41 | runs-on: ubuntu-latest 42 | needs: [test-cuda, test-cpu] 43 | if: always() 44 | steps: 45 | - name: Check all jobs status 46 | run: | 47 | if [[ "${{ needs.test-cuda.result }}" != "success" ]] || 48 | [[ "${{ needs.test-cpu.result }}" != "success" ]]; then 49 | echo "One or more jobs failed" 50 | exit 1 51 | else 52 | echo "All jobs passed" 53 | fi 54 | -------------------------------------------------------------------------------- /.github/workflows/test-cpu.yml: -------------------------------------------------------------------------------- 1 | name: Test CPU 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | artifact-name: 7 | description: 'Wheel artifact name from build workflow' 8 | required: true 9 | type: string 10 | 11 | concurrency: 12 | group: test-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | test-cpu-no-tensor-engine: 17 | name: Test CPU - No Tensor Engine 18 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 19 | with: 20 | timeout: 60 21 | runner: linux.4xlarge 22 | submodules: recursive 23 | download-artifact: ${{ inputs.artifact-name }} 24 | script: | 25 | # Source common setup functions 26 | source scripts/common-setup.sh 27 | 28 | # Setup test environment 29 | setup_conda_environment 30 | 31 | # Disable tensor engine 
32 | export USE_TENSOR_ENGINE=0 33 | 34 | # Install the built wheel from artifact 35 | install_wheel_from_artifact 36 | 37 | # Currently a no-op. 38 | # Tests requiring tensor engine / GPU need to be identified and flagged to skip. 39 | # We will just ensure monarch can be imported successfully. 40 | python -c "import monarch; print('Monarch imported successfully')" 41 | -------------------------------------------------------------------------------- /.github/workflows/test-cuda.yml: -------------------------------------------------------------------------------- 1 | name: Test CUDA 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | artifact-name: 7 | description: 'Wheel artifact name from build workflow' 8 | required: true 9 | type: string 10 | 11 | concurrency: 12 | group: test-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | test-cuda: 17 | name: Test CUDA (cuda12.6-py3.10) 18 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 19 | strategy: 20 | fail-fast: true 21 | matrix: 22 | include: 23 | - name: 4xlargegpu 24 | runs-on: linux.g5.4xlarge.nvidia.gpu 25 | torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126' 26 | gpu-arch-type: "cuda" 27 | gpu-arch-version: "12.6" 28 | with: 29 | timeout: 120 30 | runner: ${{ matrix.runs-on }} 31 | gpu-arch-type: ${{ matrix.gpu-arch-type }} 32 | gpu-arch-version: ${{ matrix.gpu-arch-version }} 33 | submodules: recursive 34 | download-artifact: ${{ inputs.artifact-name }} 35 | script: | 36 | # Source common setup functions 37 | source scripts/common-setup.sh 38 | 39 | # Setup test environment 40 | setup_test_environment 41 | 42 | # Install cargo binaries 43 | mkdir cargo_bin && mv ${RUNNER_ARTIFACT_DIR}/cargo_bin/* cargo_bin 44 | chmod +x cargo_bin/process_allocator 45 | export PATH=$(pwd)/cargo_bin:$PATH 46 | 47 | # Setup Tensor Engine dependencies 48 | setup_tensor_engine 49 | 50 
| # Install the built wheel from artifact 51 | install_wheel_from_artifact 52 | 53 | # tests the type_assert statements in test_python_actor are correct 54 | # pyre currently does not check these assertions 55 | pyright python/tests/test_python_actors.py 56 | 57 | # Run CUDA tests 58 | LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" 59 | python python/tests/test_mock_cuda.py 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | python/**/*.so 4 | python/**/*.json 5 | python/**/*.html 6 | python/**/*.pkl 7 | python/**/__pycache__ 8 | python/monarch.egg-info/* 9 | *.egg 10 | build/* 11 | dist/* 12 | monarch.egg-info/* 13 | python/monarch/monarch_controller 14 | 15 | .ipynb_checkpoints 16 | 17 | # Rust stuff 18 | target/ 19 | Cargo.lock 20 | 21 | # mdbook output 22 | books/hyperactor-book/book/** 23 | 24 | CLAUDE.md 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Meta Open Source Projects 2 | 3 | We want to make contributing to this project as easy and transparent as 4 | possible. 5 | 6 | ## Pull Requests 7 | We actively welcome your pull requests. 8 | 9 | Note: pull requests are not imported into the GitHub directory in the usual way. There is an internal Meta repository that is the "source of truth" for the project. The GitHub repository is generated *from* the internal Meta repository. So we don't merge GitHub PRs directly to the GitHub repository -- they must first be imported into internal Meta repository. When Meta employees look at the GitHub PR, there is a special button visible only to them that executes that import. The changes are then automatically reflected from the internal Meta repository back to GitHub. 
This is why you won't see your PR having been directly merged, but you will still see your changes in the repository once it reflects the imported changes. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: https://code.facebook.com/cla 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## License 33 | By contributing to this project, you agree that your contributions will be licensed 34 | under the LICENSE file in the root directory of this source tree. 
35 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "controller", 5 | "cuda-sys", 6 | "hyper", 7 | "hyperactor", 8 | "hyperactor_macros", 9 | "hyperactor_multiprocess", 10 | "hyperactor_mesh", 11 | "hyperactor_mesh_macros", 12 | "ndslice", 13 | "monarch_extension", 14 | "monarch_tensor_worker", 15 | "monarch_rdma", 16 | "nccl-sys", 17 | "rdmacore-sys", 18 | "torch-sys", 19 | "rdmacore-sys", 20 | "cuda-sys", 21 | ] 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Pre-reqs: 2 | # 1. podman (shown below) or just docker 3 | # $ dnf install -y podman podman-docker 4 | # 2. NVIDIA container toolkit 5 | # $ dnf install -y nvidia-container-toolkit 6 | # 7 | # Build: 8 | # $ cd ~/monarch 9 | # $ export TAG_NAME=$USER-dev 10 | # $ docker build --network=host \ 11 | # -t monarch:$TAG_NAME \ 12 | # -f Dockerfile . 13 | # 14 | # Build (with http proxy): 15 | # $ docker build --network=host \ 16 | # --build-arg=http_proxy=$http_proxy \ 17 | # --build-arg=https_proxy=$https_proxy \ 18 | # -t monarch:$TAG_NAME \ 19 | # -f Dockerfile . 
20 | # 21 | ARG http_proxy 22 | ARG https_proxy 23 | 24 | FROM pytorch/pytorch:2.7.0-cuda12.6-cudnn9-devel 25 | WORKDIR /monarch 26 | 27 | # export http proxy env vars if build-args are provided 28 | RUN if [ -n "${http_proxy}" ]; then export http_proxy=${http_proxy}; fi && \ 29 | if [ -n "${https_proxy}" ]; then export https_proxy=${https_proxy}; fi 30 | 31 | # Install native dependencies 32 | RUN apt-get update -y && \ 33 | apt-get -y install curl clang liblzma-dev libunwind-dev 34 | 35 | # Install Rust 36 | ENV PATH="/root/.cargo/bin:${PATH}" 37 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 38 | 39 | # Install Python deps as a separate layer to avoid rebuilding if deps do not change 40 | COPY requirements.txt . 41 | RUN pip install --no-cache-dir -r requirements.txt 42 | 43 | # Install monarch 44 | COPY . . 45 | RUN cargo install --path monarch_hyperactor 46 | RUN pip install . 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) Meta Platforms, Inc. and affiliates. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /books/hyperactor-book/.gitignore: -------------------------------------------------------------------------------- 1 | book/ 2 | -------------------------------------------------------------------------------- /books/hyperactor-book/README.md: -------------------------------------------------------------------------------- 1 | # hyperactor Documentation Book 2 | 3 | This is the development documentation for the hyperactor system, built using [`mdBook`](https://rust-lang.github.io/mdBook/). 4 | 5 | ## Running the Book 6 | 7 | ### On the **Server** 8 | 9 | To run the book on a remote server (e.g., `devgpu004`): 10 | 11 | ```bash 12 | x2ssh devgpu004.rva5.facebook.com 13 | tmux new -s mdbook 14 | cd ~/fbsource/fbcode/monarch/books/hyperactor-book 15 | mdbook serve 16 | ``` 17 | Then detach with Ctrl+b, then d. 18 | 19 | ### On the **Client** 20 | 21 | To access the remote book from your local browser: 22 | ```bash 23 | autossh -M 0 -N -L 3000:localhost:3000 devgpu004.rva5.facebook.com 24 | ``` 25 | Then open http://localhost:3000 in your browser. 
26 | 27 | **Note**: If you don’t have autossh installed, you can install it with: 28 | ```bash 29 | brew install autossh 30 | ``` 31 | 32 | ### Notes 33 | 34 | - The source is located in src/, with structure defined in SUMMARY.md. 35 | - The book will auto-reload in the browser on edits. 36 | 37 | ## Cleaning Up 38 | 39 | To shut down the book server: 40 | 41 | ### Option 1: Reattach and stop 42 | 43 | ```bash 44 | x2ssh devgpu004.rva5.facebook.com 45 | tmux attach -t mdbook 46 | ``` 47 | Inside the session: 48 | - Press Ctrl+C to stop mdbook serve 49 | - Then type exit to close the shell and terminate the tmux session 50 | 51 | ### Option 2: Kill the session directly 52 | 53 | If you don’t want to reattach, you can kill the session from a new shell: 54 | ```bash 55 | x2ssh devgpu004.rva5.facebook.com 56 | tmux kill-session -t mdbook 57 | ``` 58 | 59 | ### Optional: View active tmux sessions 60 | ```bash 61 | tmux ls 62 | ``` 63 | Use this to check whether the mdbook session is still running. 
64 | -------------------------------------------------------------------------------- /books/hyperactor-book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Shayne Fletcher"] 3 | language = "en" 4 | src = "src" 5 | title = "Hyperactor Book" 6 | 7 | [output.html] 8 | git-repository-url = "https://github.com/pytorch-labs/monarch" 9 | edit-url-template = "https://github.com/pytorch-labs/monarch/edit/main/books/hyperactor-book/src/{path}" 10 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [Introduction](./introduction.md) 4 | - [References](references/index.md) 5 | - [Syntax](references/syntax.md) 6 | - [WorldId](references/world_id.md) 7 | - [ProcId](references/proc_id.md) 8 | - [ActorId](references/actor_id.md) 9 | - [PortId](references/port_id.md) 10 | - [GangId](references/gang_id.md) 11 | - [Reference](references/reference.md) 12 | - [Typed References](references/typed_refs.md) 13 | - [Mailboxes and Routers](mailboxes/index.md) 14 | - [Ports](mailboxes/ports.md) 15 | - [MailboxSender](mailboxes/mailbox_sender.md) 16 | - [Reconfigurable Senders](mailboxes/reconfigurable_sender.md) 17 | - [MailboxServer](mailboxes/mailbox_server.md) 18 | - [MailboxClient](mailboxes/mailbox_client.md) 19 | - [Mailbox](mailboxes/mailbox.md) 20 | - [Delivery Semantics](mailboxes/delivery.md) 21 | - [Multiplexers](mailboxes/multiplexer.md) 22 | - [Routers](mailboxes/routers.md) 23 | - [Actors](actors/index.md) 24 | - [Actor](actors/actor.md) 25 | - [Handler](actors/handler.md) 26 | - [RemotableActor](actors/remotable_actor.md) 27 | - [Checkpointable](actors/checkpointable.md) 28 | - [RemoteActor](actors/remote_actor.md) 29 | - [Binds](actors/binds.md) 30 | - [RemoteHandles](actors/remote_handles.md) 31 | - 
[ActorHandle](actors/actor_handle.md) 32 | - [Actor Lifecycle](actors/actor_lifecycle.md) 33 | - [Macros](macros/index.md) 34 | - [`#[derive(Handler)]`](macros/handler.md) 35 | - [`#[derive(HandleClient)]`](macros/handle_client.md) 36 | - [`#[derive(RefClient)]`](macros/ref_client.md) 37 | - [`#[derive(Named)]`](macros/named.md) 38 | - [`#[export]`](macros/export.md) 39 | - [`#[forward]`](macros/forward.md) 40 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/actors/binds.md: -------------------------------------------------------------------------------- 1 | # Binds 2 | 3 | The `Binds` trait defines how an actor's ports are associated with the message types it can receive remotely. 4 | ```rust 5 | pub trait Binds<A: Actor>: RemoteActor { 6 | fn bind(ports: &Ports<A>); 7 | } 8 | ``` 9 | Implementing `Binds<A>` allows the system to determine which messages can be routed to an actor instance of type `A`. 10 | 11 | ## Code Generation 12 | 13 | In most cases, you do not implement this trait manually. Instead, the `#[export]` macro generates the appropriate `Binds` implementation by registering the actor's supported message types. 14 | 15 | For example: 16 | ```rust 17 | #[hyperactor::export( 18 | spawn = true, 19 | handlers = [ShoppingList], 20 | )] 21 | struct ShoppingListActor; 22 | ``` 23 | Expands to: 24 | ```rust 25 | impl Binds<ShoppingListActor> for ShoppingListActor { 26 | fn bind(ports: &Ports<Self>) { 27 | ports.bind::<ShoppingList>(); 28 | } 29 | } 30 | ``` 31 | This ensures that the actor is correctly wired to handle messages of type `ShoppingList` when used in a remote messaging context. 32 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/actors/checkpointable.md: -------------------------------------------------------------------------------- 1 | # Checkpointable 2 | 3 | The `Checkpointable` trait enables an actor to define how its internal state can be saved and restored. 
This allows actors to participate in checkpointing and recovery mechanisms when supported by the surrounding system. 4 | 5 | ## Trait definition 6 | ```rust 7 | #[async_trait] 8 | pub trait Checkpointable: Send + Sync + Sized { 9 | type State: RemoteMessage; 10 | 11 | async fn save(&self) -> Result<Self::State, CheckpointError>; 12 | async fn load(state: Self::State) -> Result<Self, CheckpointError>; 13 | } 14 | ``` 15 | 16 | ## Associated Type 17 | 18 | - `type State`: A serializable type representing the object's saved state. This must implement `RemoteMessage` so it can be serialized and transmitted. 19 | 20 | ## `save` 21 | 22 | Persists the current state of the component. Returns a `Self::State` value. If the operation fails, returns `CheckpointError::Save`. 23 | 24 | ## `load` 25 | 26 | Reconstructs a new instance from a previously saved `Self::State`. If deserialization or reconstruction fails, returns `CheckpointError::Load`. 27 | 28 | ## `CheckpointError` 29 | 30 | Errors returned by save and load operations: 31 | ```rust 32 | pub enum CheckpointError { 33 | Save(anyhow::Error), 34 | Load(SeqId, anyhow::Error), 35 | } 36 | ``` 37 | 38 | ## Blanket Implementation 39 | 40 | Any type `T` that implements `RemoteMessage` and `Clone` automatically satisfies `Checkpointable`: 41 | ```rust 42 | #[async_trait] 43 | impl<T> Checkpointable for T 44 | where 45 | T: RemoteMessage + Clone, 46 | { 47 | type State = T; 48 | 49 | async fn save(&self) -> Result<Self::State, CheckpointError> { 50 | Ok(self.clone()) 51 | } 52 | 53 | async fn load(state: Self::State) -> Result<Self, CheckpointError> { 54 | Ok(state) 55 | } 56 | } 57 | ``` 58 | This implementation uses `clone()` to produce a checkpoint and simply returns the cloned state in load. 
59 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/actors/index.md: -------------------------------------------------------------------------------- 1 | # Actors 2 | 3 | Hyperactor programs are structured around actors: isolated state machines that process messages asynchronously. 4 | 5 | Each actor runs in isolation, and maintains private internal state. Actors interact with the outside world through typed message ports and follow strict lifecycle semantics managed by the runtime. 6 | 7 | This chapter introduces the actor system in hyperactor. We'll cover: 8 | 9 | - The [`Actor`](./actor.md) trait and its lifecycle hooks 10 | - The [`Handler`](./handler.md) trait for defining message-handling behavior 11 | - The [`RemotableActor`](./remotable_actor.md) trait for enabling remote spawning 12 | - The [`Checkpointable`](./checkpointable.md) trait for supporting actor persistence and recovery 13 | - The [`RemoteActor`](./remote_actor.md) marker trait for remotely referencable types 14 | - The [`Binds`](./binds.md) trait for wiring exported ports to reference types 15 | - The [`RemoteHandles`](./remote_handles.md) trait for associating message types with a reference 16 | - The [`ActorHandle`](./actor_handle.md) type for referencing and communicating with running actors 17 | - [Actor Lifecycle](./actor_lifecycle.md), including `Signal` and `ActorStatus` 18 | 19 | Actors are instantiated with parameters and bound to mailboxes, enabling reliable message-passing. The runtime builds upon this foundation to support supervision, checkpointing, and remote interaction via typed references. 
20 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/actors/remote_actor.md: -------------------------------------------------------------------------------- 1 | # RemoteActor 2 | 3 | ```rust 4 | pub trait RemoteActor: Named + Send + Sync {} 5 | ``` 6 | This is a marker trait indicating that a type is eligible to serve as a reference to a remote actor (i.e., an actor that may reside on a different proc). 7 | 8 | It requires: 9 | - `Named`: the type must provide a static name. 10 | - `Send + Sync`: the type must be safely transferable and shareable across threads. 11 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/actors/remote_handles.md: -------------------------------------------------------------------------------- 1 | # RemoteHandles 2 | 3 | The `RemoteHandles` trait is a marker used to declare that a given `RemoteActor` type can handle messages of type `M`. 4 | ```rust 5 | pub trait RemoteHandles: RemoteActor {} 6 | ``` 7 | 8 | An implementation like: 9 | ```rust 10 | impl RemoteHandles for ShoppingListActor {} 11 | ``` 12 | means that `ShoppingListActor` is known to handle the `ShoppingList` message type. 13 | 14 | These implementations are typically generated by the `#[export(handlers = [...])]` macro, and are not written by hand. 15 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This book describes the design and implementation of the hyperactor runtime. 4 | 5 | The goal is to provide a clear, structured explanation of how actors communicate safely and efficiently across distributed systems using hyperactor’s abstractions. 6 | 7 | We hope this becomes the book we wish we had when we started working with Monarch. Work in progress. 
8 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/macros/forward.md: -------------------------------------------------------------------------------- 1 | # `#[forward]` 2 | 3 | The `#[hyperactor::forward]` macro connects a user-defined handler trait implementation (like `ShoppingListHandler`) to the core `Handler` trait required by the runtime. 4 | 5 | In short, it generates the boilerplate needed to route incoming messages of type `T` to your high-level trait implementation. 6 | 7 | ## What it generates 8 | 9 | The macro expands to: 10 | ```rust 11 | #[async_trait] 12 | impl Handler for ShoppingListActor { 13 | async fn handle(&mut self, ctx: &Context, message: ShoppingList) -> Result<(), Error> { 14 | ::handle(self, ctx, message).await 15 | } 16 | } 17 | ``` 18 | This avoids having to manually match on enum variants or duplicate message logic. 19 | 20 | ## When to use it 21 | 22 | Use `#[forward(MessageType)]` when: 23 | 24 | - You’ve defined a custom trait (e.g., `ShoppingListHandler`) 25 | - You’re handling a message enum (like `ShoppingList`) 26 | - You want the runtime to route messages to your trait automatically. 27 | 28 | This is most often used alongside `#[derive(Handler)]`, which generates the corresponding handler and client traits for a user-defined message enum. 29 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/macros/index.md: -------------------------------------------------------------------------------- 1 | # Macros 2 | 3 | This section documents the macros provided by hyperactor for actor and message integration. 4 | 5 | These macros support a complete message-passing workflow: from defining message enums and generating client APIs, to routing messages and exporting actors for dynamic or remote use. 
6 | 7 | - [`#[derive(Handler)]`](handler.md) — generate message handling and client traits for actor enums 8 | - [`#[derive(HandleClient)]`](handle_client.md) — implement the generated client trait for `ActorHandle` 9 | - [`#[derive(RefClient)]`](ref_client.md) — implement the generated client trait for `ActorRef` 10 | - [`#[derive(Named)]`](named.md) — give a type a globally unique name and port for routing and reflection 11 | - [`#[export]`](export.md) — make an actor routable by registering its type and handlers, and optionally spawnable from outside the current runtime 12 | - [`#[forward]`](forward.md) — route messages to a user-defined handler trait implementation 13 | 14 | ## Macro Summary 15 | 16 | - **`#[derive(Handler)]`** 17 | Generates handler and client traits for a message enum. 18 | 19 | - **`#[derive(HandleClient)]`** 20 | Implements the client trait for `ActorHandle`. 21 | 22 | - **`#[derive(RefClient)]`** 23 | Implements the client trait for `ActorRef`. 24 | 25 | - **`#[derive(Named)]`** 26 | Registers the type with a globally unique name and port. 27 | 28 | - **`#[export]`** 29 | Makes an actor spawnable and routable via inventory. 30 | 31 | - **`#[forward]`** 32 | Forwards messages to a user-defined handler trait implementation. 33 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/macros/ref_client.md: -------------------------------------------------------------------------------- 1 | # `#[derive(RefClient)]` 2 | 3 | While `#[derive(HandleClient)]` enables calling the generated client trait on `ActorHandle`, there are cases where you don’t have a handle, only a reference to an actor (`ActorRef`). This is where `#[derive(RefClient)]` comes in. 
4 | 5 | ## What It Adds 6 | 7 | `#[derive(RefClient)]` generates the following implementation: 8 | ```rust 9 | impl ShoppingListClient for ActorRef 10 | where 11 | T: ShoppingListHandler + Send + Sync + 'static 12 | ``` 13 | This allows you to invoke methods like `.add(...)` or `.list(...)` directly on an `ActorRef`. 14 | 15 | In other words, `RefClient` connects the generated `ShoppingListClient` interface (from `Handler`) to the `ActorRef` type, which refers to a remote actor. 16 | 17 | ## Generated Implementation (simplified) 18 | 19 | ```rust 20 | use async_trait::async_trait; 21 | use hyperactor::{ 22 | ActorRef, 23 | anyhow::Error, 24 | cap::{CanSend, CanOpenPort}, 25 | mailbox::open_once_port, 26 | metrics, 27 | Message, 28 | }; 29 | 30 | #[async_trait] 31 | impl ShoppingListClient for ActorRef 32 | where 33 | T: ShoppingListHandler + Send + Sync + 'static, 34 | { 35 | async fn add(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { 36 | self.send(caps, ShoppingList::Add(item)).await 37 | } 38 | 39 | async fn remove(&self, caps: &impl CanSend, item: String) -> Result<(), Error> { 40 | self.send(caps, ShoppingList::Remove(item)).await 41 | } 42 | 43 | async fn exists( 44 | &self, 45 | caps: &impl CanSend + CanOpenPort, 46 | item: String, 47 | ) -> Result { 48 | let (reply_to, recv) = open_once_port(caps)?; 49 | self.send(caps, ShoppingList::Exists(item, reply_to)).await?; 50 | Ok(recv.await?) 51 | } 52 | 53 | async fn list( 54 | &self, 55 | caps: &impl CanSend + CanOpenPort, 56 | ) -> Result, Error> { 57 | let (reply_to, recv) = open_once_port(caps)?; 58 | self.send(caps, ShoppingList::List(reply_to)).await?; 59 | Ok(recv.await?) 
60 | } 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/mailboxes/index.md: -------------------------------------------------------------------------------- 1 | # Mailboxes and Routers 2 | 3 | Mailboxes are the foundation of message delivery in hyperactor. They coordinate typed ports, routing logic, forwarding, and delivery infrastructure for distributed actors. 4 | 5 | This chapter introduces the components of the mailbox subsystem: 6 | 7 | - [Ports](ports.md): typed channels for local message delivery 8 | - [MailboxSender](mailbox_sender.md): trait-based abstraction for message posting 9 | - [Reconfigurable Senders](reconfigurable_sender.md): deferred wiring and dynamic configuration 10 | - [MailboxServer](mailbox_server.md): bridging incoming message streams into mailboxes 11 | - [MailboxClient](mailbox_client.md): buffering, forwarding, and failure reporting 12 | - [Mailbox](mailbox.md): port registration, binding, and routing 13 | - [Delivery Semantics](delivery.md): envelopes, delivery errors, and failure handling 14 | - [Multiplexers](multiplexer.md): port-level dispatch to local mailboxes 15 | - [Routers](routers.md): prefix-based routing to local or remote destinations 16 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/bindings.md: -------------------------------------------------------------------------------- 1 | # Bindings 2 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/gang_id.md: -------------------------------------------------------------------------------- 1 | # `GangId` 2 | 3 | A `GangId` identifies a logical group of actors with the same name across all procs in a world. It serves as a convenient shorthand for referring to all root instances of a given actor name. 
4 | ```rust 5 | #[derive( 6 | Debug, 7 | Serialize, 8 | Deserialize, 9 | Clone, 10 | PartialEq, 11 | Eq, 12 | PartialOrd, 13 | Hash, 14 | Ord, 15 | Named 16 | )] 17 | pub struct GangId(pub WorldId, pub String); 18 | ``` 19 | - The first field is the WorldId. 20 | - The second field is the shared actor name. 21 | 22 | A `GangId` is conceptually like saying: “the actor named X on every proc in world W.” 23 | 24 | ## Construction 25 | 26 | ```rust 27 | use hyperactor::reference::{GangId, WorldId}; 28 | 29 | let gang = GangId(WorldId("training".into()), "logger".into()); 30 | ``` 31 | 32 | Or using the id! macro: 33 | ```rust 34 | use hyperactor::id; 35 | 36 | let gang = id!(training.logger); 37 | // Equivalent to GangId(WorldId("training".into()), "logger".into()) 38 | ``` 39 | 40 | ## Methods 41 | 42 | ```rust 43 | impl GangId { 44 | pub fn world_id(&self) -> &WorldId; 45 | pub fn name(&self) -> &str; 46 | pub fn actor_id(&self, rank: usize) -> ActorId; 47 | pub fn expand(&self, world_size: usize) -> impl Iterator + '_; 48 | } 49 | ``` 50 | - `.world_id()` returns the world this gang is defined in. 51 | - `.name()` returns the shared actor name (e.g., "logger"). 52 | - `.actor_id(rank)` returns the root actor on that proc. 53 | - `.expand(world_size)` yields all root ActorIds from rank `0..world_size`. 54 | 55 | ## Semantics 56 | 57 | - Gangs are always composed of root actors (`pid = 0`) with a common name. 58 | - Gang references are useful for broadcasting, coordination, or actor discovery. 59 | - They are lightweight and purely name-based; no state is attached to a `GangId`. 
60 | 61 | ## Traits 62 | 63 | `GangId` implements: 64 | - `Display` — formatted as world.actor 65 | - `FromStr` — parses from strings like "training.logger" 66 | - `Ord`, `Eq`, `Hash` — usable in maps, registries, and routing 67 | - `Named` — enables type registration and metadata lookup 68 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/gangs.md: -------------------------------------------------------------------------------- 1 | # Gangs 2 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/index.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | This section documents the reference system used throughout hyperactor to identify and communicate with distributed entities. 4 | 5 | References are lightweight, serializable identifiers for **worlds**, **procs**, **actors**, **ports**, and **gangs**. They are the backbone of addressing and routing in the runtime. Whether you're sending a message, spawning an actor, or broadcasting to a group, references are how you name things. 6 | 7 | The reference system is: 8 | 9 | - **Uniform**: All references follow a shared syntax and structure. 10 | - **Parsable**: References can be round-tripped from strings and manipulated programmatically. 11 | - **Typed**: While the `Reference` enum is typeless and dynamic, typed references like `ActorRef` and `PortRef` allow safe interaction in APIs. 12 | - **Orderable**: References implement a total order, enabling prefix-based routing and sorted maps. 
13 | 14 | In this section, we’ll cover: 15 | 16 | - The [syntax](syntax.md) and string format of references 17 | - The core reference types: 18 | - [`WorldId`](world_id.md) 19 | - [`ProcId`](proc_id.md) 20 | - [`ActorId`](actor_id.md) 21 | - [`PortId`](port_id.md) 22 | - [`GangId`](gang_id.md) 23 | - The [Reference](reference.md), which unifies all reference variants 24 | - [Typed references](typed_refs.md) used in APIs: `ActorRef`, `PortRef`, and `OncePortRef` 25 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/port_id.md: -------------------------------------------------------------------------------- 1 | # `PortId` 2 | 3 | A `PortId` identifies a specific port on a particular actor. Ports are the entry points through which messages are delivered to an actor, and each `PortId` is globally unique. 4 | 5 | ```rust 6 | #[derive( 7 | Debug, 8 | Serialize, 9 | Deserialize, 10 | Clone, 11 | PartialEq, 12 | Eq, 13 | PartialOrd, 14 | Hash, 15 | Ord, 16 | Named 17 | )] 18 | pub struct PortId(pub ActorId, pub u64); 19 | ``` 20 | - The first field is the owning `ActorId`. 21 | - The second field is the port number (`u64`), typically derived from the message type’s registered port. 22 | 23 | ## Construction 24 | 25 | ```rust 26 | use hyperactor::reference::{PortId, ActorId}; 27 | 28 | let port = PortId(actor, 42); 29 | ``` 30 | Or via the `id!` macro: 31 | ```rust 32 | use hyperactor::id; 33 | 34 | let port = id!(training[0].logger[1][42]); 35 | // Equivalent to PortId(ActorId(...), 42) 36 | ``` 37 | You can also construct a PortId from an `ActorId` using `.port_id(...)`: 38 | ```rust 39 | let port = actor.port_id(42); 40 | ``` 41 | 42 | ## Methods 43 | 44 | ```rust 45 | impl PortId { 46 | pub fn actor_id(&self) -> &ActorId; 47 | pub fn index(&self) -> u64; 48 | pub fn into_actor_id(self) -> ActorId; 49 | } 50 | ``` 51 | - `.actor_id()` returns the owning actor. 52 | - `.index()` returns the port number. 
53 | - `.into_actor_id()` discards the port index and yields the owning actor ID. 54 | 55 | ## Traits 56 | 57 | `PortId` implements: 58 | - `Display` — formatted as `world[rank].actor[pid][port]` 59 | - `FromStr` — parses from strings like `"training[0].logger[1][42]"` 60 | - `Ord`, `Eq`, `Hash` — usable as map keys or for dispatch 61 | - `Named` — supports reflection and typed messaging 62 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/proc_id.md: -------------------------------------------------------------------------------- 1 | # `ProcId` 2 | 3 | A `ProcId` identifies a single runtime instance within a world. All actors exist within a proc, and message routing between actors is scoped by the proc’s identity. 4 | ```rust 5 | #[derive( 6 | Debug, 7 | Serialize, 8 | Deserialize, 9 | Clone, 10 | PartialEq, 11 | Eq, 12 | PartialOrd, 13 | Hash, 14 | Ord, 15 | Named 16 | )] 17 | pub struct ProcId(pub WorldId, pub usize); 18 | ``` 19 | 20 | ## Construction 21 | 22 | You can construct a `ProcId` directly: 23 | ```rust 24 | use hyperactor::reference::{WorldId, ProcId}; 25 | 26 | let proc = ProcId(WorldId("training".into()), 0); 27 | ``` 28 | Or statically using the `id!` macro: 29 | ```rust 30 | use hyperactor::id; 31 | 32 | let proc = id!(training[0]); // Equivalent to ProcId(WorldId("training".into()), 0) 33 | ``` 34 | 35 | ## Methods 36 | 37 | ```rust 38 | impl ProcId { 39 | pub fn world_id(&self) -> &WorldId; 40 | pub fn world_name(&self) -> &str; 41 | pub fn rank(&self) -> usize; 42 | pub fn actor_id(&self, name: impl Into<String>, pid: usize) -> ActorId; 43 | } 44 | ``` 45 | - `.world_id()` gives the `WorldId` this proc belongs to. 46 | - `.rank()` returns the proc’s index. 47 | - `.actor_id(name, pid)` constructs an `ActorId` for an actor hosted on this proc. 48 | 49 | ## Notes 50 | 51 | Ranks greater than or equal to `1 << (usize::BITS - 1)` are considered user-space procs. 
These are typically created with `WorldId::random_user_proc()` and are not assigned by the system. 52 | 53 | ## Traits 54 | 55 | ProcId implements: 56 | - `Display` — formatted as `world[rank]` 57 | - `FromStr` — parses from strings like "training[0]" 58 | - `Ord`, `Eq`, `Hash` — usable in maps and sorted structures 59 | - `Named` — enables port lookup and type reflection 60 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/syntax.md: -------------------------------------------------------------------------------- 1 | # Syntax 2 | 3 | References in Hyperactor follow a uniform concrete syntax that can be written as strings, parsed at runtime, or constructed statically using the `id!` macro. 4 | 5 | ## String Form 6 | 7 | The canonical string syntax supports hierarchical references, from worlds down to ports: 8 | ```text 9 | world 10 | world[rank] 11 | world[rank].actor // actor[0] 12 | world[rank].actor[pid] 13 | world[rank].actor[pid][port] 14 | world.actor // gang reference 15 | ``` 16 | 17 | These forms can be used wherever a reference is accepted as a string, such as command-line arguments, config files, and logs. 18 | 19 | Examples: 20 | 21 | - `training` — world ID 22 | - `training[0]` — proc 0 in world `training` 23 | - `training[0].logger[1]` — actor named `logger`, pid 1 24 | - `training[0].logger[1][42]` — port 42 of that actor 25 | - `training.logger` — gang reference 26 | 27 | The parser is robust and fails clearly on invalid syntax. 28 | 29 | ## Runtime Parsing 30 | 31 | The `Reference` type implements `FromStr`, so you can parse strings into references: 32 | 33 | ```rust 34 | use hyperactor::reference::Reference; 35 | 36 | let r: Reference = "training[2].worker[0]".parse().unwrap(); 37 | ``` 38 | 39 | It returns a strongly typed enum: `Reference::Actor`, `Reference::Port`, etc. 
40 | 41 | ## Static Construction with `id!` 42 | 43 | You can also construct references statically using the `id!` macro. This macro uses the same concrete syntax: 44 | ```rust 45 | use hyperactor::id; 46 | use hyperactor::reference::{WorldId, ProcId, ActorId, PortId, GangId}; 47 | 48 | let w: WorldId = id!(training); 49 | let p: ProcId = id!(training[0]); 50 | let a: ActorId = id!(training[0].logger[1]); 51 | let port: PortId = id!(training[0].logger[1][42]); 52 | let g: GangId = id!(training.logger); 53 | ``` 54 | 55 | The macro expands to correct type constructors and ensures compile-time validity. The `id!()` macro does not produce a `Reference` enum; it constructs the corresponding concrete type directly (e.g., `WorldId`, `ProcId`, `ActorId`). This contrasts with parsing, which always yields a `Reference`. 56 | -------------------------------------------------------------------------------- /books/hyperactor-book/src/references/world_id.md: -------------------------------------------------------------------------------- 1 | # `WorldId` 2 | 3 | A `WorldId` defines the top-level namespace for procs and actors. All procs, actors, ports, and gangs exist within a world. 
4 | ```rust 5 | #[derive( 6 | Debug, 7 | Serialize, 8 | Deserialize, 9 | Clone, 10 | PartialEq, 11 | Eq, 12 | PartialOrd, 13 | Hash, 14 | Ord, 15 | Named 16 | )] 17 | pub struct WorldId(pub String); 18 | ``` 19 | 20 | ## Construction 21 | 22 | A `WorldId` wraps a string and can be created directly: 23 | ```rust 24 | use hyperactor::reference::WorldId; 25 | 26 | let world = WorldId("training".into()); 27 | ``` 28 | Or statically using the `id!` macro: 29 | ```rust 30 | use hyperactor::id; 31 | 32 | let world = id!(training); // Equivalent to WorldId("training".into()) 33 | ``` 34 | 35 | ## Methods 36 | 37 | ```rust 38 | impl WorldId { 39 | pub fn name(&self) -> &str; 40 | pub fn proc_id(&self, index: usize) -> ProcId; 41 | pub fn random_user_proc(&self) -> ProcId; 42 | } 43 | ``` 44 | - `.name()` returns the world name string. 45 | - `.proc_id(index)` constructs a `ProcId` rooted in this world. 46 | - `.random_user_proc()` generates a `ProcId` with the high bit set, marking it as a user-space proc ID. 
47 | 48 | ## Traits 49 | 50 | `WorldId` implements: 51 | - `Display` — string form is just the world name 52 | - `FromStr` — parses from "training" into WorldId("training") 53 | - `Ord`, `Eq`, `Hash` — suitable for use as map/set keys 54 | - `Named` — used for type reflection and message dispatch 55 | -------------------------------------------------------------------------------- /build-requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | setuptools 3 | setuptools-rust 4 | wheel 5 | numpy 6 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | too-many-lines-threshold = 200 2 | await-holding-invalid-types = [ 3 | { path = "tracing::span::Entered", reason = "`Entered` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 4 | { path = "tracing::span::EnteredSpan", reason = "`EnteredSpan` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 5 | ] 6 | -------------------------------------------------------------------------------- /controller/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/controller:[controller,controller-bin] 2 | 3 | [package] 4 | name = "controller" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "controller_bin" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | anyhow = "1.0.98" 16 | async-trait = "0.1.86" 17 | bincode = "1.3.3" 18 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 19 | const_format = "0.2" 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_mesh = { version = 
"0.0.0", path = "../hyperactor_mesh" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 24 | nccl-sys = { path = "../nccl-sys" } 25 | ndslice = { version = "0.0.0", path = "../ndslice" } 26 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 27 | serde = { version = "1.0.185", features = ["derive", "rc"] } 28 | serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] } 29 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 30 | torch-sys = { path = "../torch-sys" } 31 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 32 | 33 | [dev-dependencies] 34 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 35 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 36 | -------------------------------------------------------------------------------- /controller/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // This is needed due to the controller being built with torch/nccl deps due to monarch_messages. 10 | 11 | fn main() { 12 | // `torch-sys` will set this env var through Cargo `links` metadata. 13 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 14 | // Set the rpath so that the dynamic linker can find libtorch and friends. 15 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 16 | 17 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 18 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 19 | } 20 | 21 | // Disable new dtags, as conda envs generally use `RPATH` over `RUNPATH`. 
22 | println!("cargo::rustc-link-arg=-Wl,--disable-new-dtags"); 23 | 24 | println!("cargo:rustc-link-lib=lzma"); 25 | } 26 | -------------------------------------------------------------------------------- /cuda-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cuda-sys" 3 | version = "0.0.0" 4 | authors = ["Facebook"] 5 | edition = "2021" 6 | license = "MIT" 7 | links = "cuda" 8 | description = "Rust FFI bindings for CUDA libraries" 9 | 10 | [dependencies] 11 | cxx = "1.0.119" 12 | serde = { version = "1.0.185", features = ["derive", "rc"] } 13 | 14 | [build-dependencies] 15 | bindgen = "0.70.1" 16 | which = "6.0.3" 17 | glob = "0.3.1" 18 | -------------------------------------------------------------------------------- /cuda-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /* 10 | * Copyright (c) Meta Platforms, Inc. and affiliates. 11 | * All rights reserved. 12 | * 13 | * This source code is licensed under the BSD-style license found in the 14 | * LICENSE file in the root directory of this source tree. 15 | */ 16 | 17 | use cxx::ExternType; 18 | use cxx::type_id; 19 | 20 | /// SAFETY: bindings 21 | unsafe impl ExternType for CUstream_st { 22 | type Id = type_id!("CUstream_st"); 23 | type Kind = cxx::kind::Opaque; 24 | } 25 | 26 | // When building with cargo, this is actually the lib.rs file for a crate. 27 | // Include the generated bindings.rs and suppress lints. 
28 | #[allow(non_camel_case_types)] 29 | #[allow(non_upper_case_globals)] 30 | #[allow(non_snake_case)] 31 | mod inner { 32 | #[cfg(cargo)] 33 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 34 | } 35 | 36 | pub use inner::*; 37 | 38 | #[cfg(test)] 39 | mod tests { 40 | use std::mem::MaybeUninit; 41 | 42 | use super::*; 43 | 44 | #[test] 45 | fn sanity() { 46 | // SAFETY: testing bindings 47 | unsafe { 48 | let mut version = MaybeUninit::::uninit(); 49 | let result = cuDriverGetVersion(version.as_mut_ptr()); 50 | assert_eq!(result, cudaError_enum(0)); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /cuda-sys/src/wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/27b2e0d942ab98fc9c3e74f4ea512d8f28d515b8/examples/__init__.py -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Example Notebooks 2 | 3 | This folder contains some basic examples of using the Monarch API in jupyter notebooks. 4 | 5 | ## Setup 6 | 1. Follow the instructions outlined in ../../monarch/README.md to setup Monarch 7 | 2. Pip install jupyter: 8 | `pip install jupyter notebook` 9 | 3. Run your jupyter notebook: `jupyter notebook` 10 | 4. 
(optional) In remote settings (as in a devserver), you can also port forward your jupyter notebook to your local machine. e.g. 11 | ``` 12 | # devserver 13 | jupyter notebook --no-browser --port=8098 14 | 15 | #local 16 | ssh -N -L 8098:localhost:8098 17 | ``` 18 | 5. Open localhost:8098 in your browser to see the jupyter notebook 19 | 20 | 21 | ## Manifest 22 | * ping_pong.ipynb - Simple hello world with Actor API + Inter Actor Communication 23 | -------------------------------------------------------------------------------- /hyper/Cargo.toml: -------------------------------------------------------------------------------- 1 | # This file is manually maintained to maintain the ability to build hyper 2 | # using cargo. The code is annotated with fbcode_build conditionals such that 3 | # it works with both cargo (all oss deps) and buck (full meta deps). 4 | [package] 5 | name = "hyper" 6 | version = "0.0.0" 7 | authors = ["Facebook"] 8 | edition = "2021" 9 | license = "MIT" 10 | 11 | [dependencies] 12 | anyhow = "1.0.95" 13 | async-trait = "0.1.86" 14 | chrono = { version = "=0.4.39", features = ["clock", "serde", "std"], default-features = false } 15 | clap = { version = "4.5.30", features = ["derive", "env", "string", "unicode", "wrap_help"] } 16 | console = "0.15.7" 17 | hyperactor = { path = "../hyperactor" } 18 | hyperactor_multiprocess = { path = "../hyperactor_multiprocess" } 19 | serde = { version = "1.0.185", features = ["derive", "rc"] } 20 | serde_json = { version = "1.0.132", features = ["float_roundtrip", "unbounded_depth"] } 21 | tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } 22 | tokio = { version = "1.41.0", features = ["full", "test-util", "tracing"] } 23 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 24 | 25 | [lints] 26 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 27 | -------------------------------------------------------------------------------- 
/hyper/src/commands.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod demo; 10 | pub mod procs; 11 | pub mod serve; 12 | pub mod show; 13 | #[cfg(fbcode_build)] 14 | pub mod top; 15 | -------------------------------------------------------------------------------- /hyper/src/commands/serve.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::time::Duration; 10 | 11 | use hyperactor::channel::ChannelAddr; 12 | use hyperactor::channel::ChannelTransport; 13 | use hyperactor_multiprocess::system::System; 14 | 15 | // The commands in the demo spawn temporary actors that join a system. 16 | // Set a long heartbeat duration so we do not check heartbeats for these actors. 17 | // [`Duration::from_secs`] is a stable API. Any APIs with units bigger than secs are unstable. 18 | static LONG_DURATION: Duration = Duration::from_secs(500000); 19 | 20 | #[derive(clap::Args, Debug)] 21 | pub struct ServeCommand { 22 | /// The address to serve the system actor on. If not specified, the local 23 | /// host will be used.
24 | #[arg(short, long)] 25 | addr: Option, 26 | } 27 | 28 | impl ServeCommand { 29 | pub async fn run(self) -> anyhow::Result<()> { 30 | let addr = self.addr.unwrap_or(ChannelAddr::any(ChannelTransport::Tcp)); 31 | let handle = System::serve(addr, LONG_DURATION, LONG_DURATION).await?; 32 | eprintln!("serve: {}", handle.local_addr()); 33 | handle.await; 34 | Ok(()) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /hyper/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod tui; 10 | pub mod utils; 11 | -------------------------------------------------------------------------------- /hyper/src/tui/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #[cfg(fbcode_build)] 10 | pub mod top; 11 | -------------------------------------------------------------------------------- /hyper/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | pub mod system_address; 10 | -------------------------------------------------------------------------------- /hyperactor/src/checkpoint.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Checkpoint functionality for various objects to save and load states. 10 | 11 | use std::fmt::Debug; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::RemoteMessage; 16 | use crate::mailbox::log::SeqId; 17 | 18 | /// Errors that occur during checkpoint operations. 19 | /// This enum is marked non-exhaustive to allow for extensibility. 20 | #[derive(thiserror::Error, Debug)] 21 | #[non_exhaustive] 22 | pub enum CheckpointError { 23 | /// An error occurred during saving checkpoints. 24 | #[error("save")] 25 | Save(#[source] anyhow::Error), 26 | 27 | /// An error occurred during loading checkpoints. 28 | #[error("load: {0}")] 29 | Load(SeqId, #[source] anyhow::Error), 30 | } 31 | 32 | /// [`Checkpointable`] is used to save the state of an instance so that it can be restored later. 33 | #[async_trait] 34 | pub trait Checkpointable: Send + Sync + Sized { 35 | /// The type of the state that is saved. The state can be serialized and deserialized 36 | /// from persistent storage. 37 | type State: RemoteMessage; 38 | 39 | /// Saves the current state. 40 | async fn save(&self) -> Result; 41 | 42 | /// Loads a state to restore the instance. 43 | async fn load(state: Self::State) -> Result; 44 | } 45 | -------------------------------------------------------------------------------- /hyperactor/src/mailbox/mailbox_admin_message.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates.
3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use serde::Deserialize; 10 | use serde::Serialize; 11 | 12 | pub use crate as hyperactor; 13 | use crate::HandleClient; 14 | use crate::Handler; 15 | use crate::Named; 16 | use crate::ProcId; 17 | use crate::RefClient; 18 | use crate::mailbox::ChannelAddr; 19 | 20 | /// Messages relating to mailbox administration. 21 | #[derive( 22 | Handler, 23 | HandleClient, 24 | RefClient, 25 | Debug, 26 | Serialize, 27 | Deserialize, 28 | Clone, 29 | PartialEq, 30 | Named 31 | )] 32 | pub enum MailboxAdminMessage { 33 | /// An address update. 34 | UpdateAddress { 35 | /// The ID of the proc. 36 | proc_id: ProcId, 37 | 38 | /// The address at which it listens. 39 | addr: ChannelAddr, 40 | }, 41 | } 42 | -------------------------------------------------------------------------------- /hyperactor/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A bunch of statically defined metrics. Defined here because they are used in 10 | //! both macros and handwritten code.
11 | 12 | use hyperactor_telemetry::declare_static_counter; 13 | use hyperactor_telemetry::declare_static_timer; 14 | use hyperactor_telemetry::declare_static_up_down_counter; 15 | 16 | declare_static_counter!(MESSAGES_SENT, "messages_sent"); 17 | declare_static_counter!(MESSAGES_RECEIVED, "messages_received"); 18 | declare_static_counter!(MESSAGE_HANDLE_ERRORS, "message_handle_errors"); 19 | declare_static_counter!(MESSAGE_RECEIVE_ERRORS, "message_receive_errors"); 20 | declare_static_up_down_counter!(MESSAGE_QUEUE_SIZE, "message_queue_size"); 21 | declare_static_timer!( 22 | MESSAGE_HANDLER_DURATION, 23 | "message_handler_duration", 24 | hyperactor_telemetry::TimeUnit::Nanos 25 | ); 26 | 27 | declare_static_timer!( 28 | ACTOR_STATUS, 29 | "actor.status", 30 | hyperactor_telemetry::TimeUnit::Nanos 31 | ); 32 | -------------------------------------------------------------------------------- /hyperactor/src/spawn.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use std::sync::Arc; 10 | use std::sync::atomic::AtomicU64; 11 | use std::sync::atomic::Ordering; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::actor::Actor; 16 | use crate::actor::ActorHandle; 17 | use crate::cap::sealed::CanSpawn; 18 | use crate::mailbox::BoxedMailboxSender; 19 | use crate::reference::ActorId; 20 | #[derive(Debug)] 21 | struct LocalSpawnerState { 22 | root: ActorId, 23 | sender: BoxedMailboxSender, 24 | next_pid: AtomicU64, 25 | } 26 | 27 | #[derive(Clone, Debug)] 28 | pub(crate) struct LocalSpawner(Option>); 29 | 30 | impl LocalSpawner { 31 | pub(crate) fn new(root: ActorId, sender: BoxedMailboxSender) -> Self { 32 | Self(Some(Arc::new(LocalSpawnerState { 33 | root, 34 | sender, 35 | next_pid: AtomicU64::new(1), 36 | }))) 37 | } 38 | 39 | pub(crate) fn new_panicking() -> Self { 40 | Self(None) 41 | } 42 | } 43 | 44 | #[async_trait] 45 | impl CanSpawn for LocalSpawner { 46 | async fn spawn(&self, params: A::Params) -> ActorHandle { 47 | let state = self.0.as_ref().expect("invalid spawner"); 48 | let pid = state.next_pid.fetch_add(1, Ordering::Relaxed); 49 | let actor_id = state.root.child_id(pid); 50 | A::do_spawn(state.sender.clone(), actor_id, params, self.clone()) 51 | .await 52 | .unwrap() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /hyperactor/src/supervision.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Messages used in supervision. 
10 | 11 | use std::fmt::Debug; 12 | 13 | use serde::Deserialize; 14 | use serde::Serialize; 15 | 16 | use crate as hyperactor; // for macros 17 | use crate::Named; 18 | use crate::actor::ActorStatus; 19 | use crate::reference::ActorId; 20 | 21 | /// This is the local actor supervision event. Child actor will propagate this event to its parent. 22 | #[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Eq)] 23 | pub struct ActorSupervisionEvent { 24 | /// The actor id of the child actor where the event is triggered. 25 | actor_id: ActorId, 26 | /// Status of the child actor. 27 | actor_status: ActorStatus, 28 | } 29 | 30 | impl ActorSupervisionEvent { 31 | /// Create a new actor supervision event. 32 | pub fn new(actor_id: ActorId, actor_status: ActorStatus) -> Self { 33 | Self { 34 | actor_id, 35 | actor_status, 36 | } 37 | } 38 | /// Get the actor id of the supervision event. 39 | pub fn actor_id(&self) -> &ActorId { 40 | &self.actor_id 41 | } 42 | /// Get the actor status of the supervision event. 43 | pub fn actor_status(&self) -> &ActorStatus { 44 | &self.actor_status 45 | } 46 | 47 | /// Consume this event to a tuple. 48 | pub fn into_inner(self) -> (ActorId, ActorStatus) { 49 | (self.actor_id, self.actor_status) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /hyperactor/src/sync.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Synchronization primitives that are used by Hyperactor. 10 | //! 11 | //! These are used in related Hyperactor crates as well, and are thus part of the 12 | //! public API. However, they should not be considered a stable part of the Hyperactor 13 | //! 
API itself, and they may be moved to a different crate in the future. 14 | 15 | pub mod flag; 16 | pub mod monitor; 17 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// PingPongActor test util. 10 | pub mod pingpong; 11 | /// ProcSupervisionCoordinator test util. 12 | pub mod proc_supervison; 13 | /// Used to verify behaviors related to process. 14 | pub mod process_assertion; 15 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/process_assertion.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::future::Future; 10 | 11 | use nix::sys::wait::WaitStatus; 12 | use nix::sys::wait::waitpid; 13 | use nix::unistd::ForkResult; 14 | use nix::unistd::fork; 15 | 16 | /// Fork a child process, execute the given function in that process, and verify 17 | /// that the process exits with the given exit code. 18 | pub async fn assert_termination(f: F, expected_code: i32) -> anyhow::Result<()> 19 | where 20 | F: FnOnce() -> Fut, 21 | Fut: Future, 22 | { 23 | // SAFETY: for unit test process assertion. 24 | unsafe { 25 | match fork() { 26 | Ok(ForkResult::Parent { child, .. }) => match waitpid(child, None)? 
{ 27 | WaitStatus::Exited(_, exit_code) => { 28 | anyhow::ensure!(exit_code == expected_code); 29 | Ok(()) 30 | } 31 | status => Err(anyhow::anyhow!( 32 | "didn't receive expected status. got: {:?}", 33 | status 34 | )), 35 | }, 36 | Ok(ForkResult::Child) => Ok(f().await), 37 | Err(_) => Err(anyhow::anyhow!("fork failed")), 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hyperactor_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_macros:[hyperactor_macros,hyperactor_macros_test] 2 | 3 | [package] 4 | name = "hyperactor_macros" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "macros to support the Hyperactor actors and data exchange" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [lib] 13 | test = false 14 | doctest = false 15 | proc-macro = true 16 | 17 | [[test]] 18 | name = "hyperactor_macros_test" 19 | path = "tests/basic.rs" 20 | 21 | [dependencies] 22 | convert_case = "0.6" 23 | indoc = "2.0.2" 24 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 25 | quote = "1.0.29" 26 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 27 | 28 | [dev-dependencies] 29 | anyhow = "1.0.98" 30 | async-trait = "0.1.86" 31 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 32 | serde = { version = "1.0.185", features = ["derive", "rc"] } 33 | timed_test = { version = "0.0.0", path = "../timed_test" } 34 | tokio = { version = "1.37.0", features = ["full", "test-util", "tracing"] } 35 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 36 | -------------------------------------------------------------------------------- /hyperactor_macros/build.rs: -------------------------------------------------------------------------------- 
1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | println!("cargo::rustc-check-cfg=cfg(enable_hyperactor_message_logging)"); 11 | } 12 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor_telemetry::*; 10 | 11 | declare_static_timer!( 12 | ACTOR_MESH_CAST_DURATION, 13 | "actor_mesh_cast_duration", 14 | TimeUnit::Micros 15 | ); 16 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/test_utils.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | use hyperactor::Actor; 11 | use hyperactor::Bind; 12 | use hyperactor::Context; 13 | use hyperactor::Handler; 14 | use hyperactor::Named; 15 | use hyperactor::Unbind; 16 | use serde::Deserialize; 17 | use serde::Serialize; 18 | 19 | /// Message that can be sent to an EmptyActor. 20 | #[derive(Serialize, Deserialize, Debug, Named, Clone, Bind, Unbind)] 21 | pub struct EmptyMessage(); 22 | 23 | /// No-op actor. 
24 | #[derive(Debug, PartialEq)] 25 | #[hyperactor::export( 26 | handlers = [ 27 | EmptyMessage { cast = true }, 28 | ], 29 | )] 30 | pub struct EmptyActor(); 31 | 32 | #[async_trait] 33 | impl Actor for EmptyActor { 34 | type Params = (); 35 | 36 | async fn new(_: ()) -> Result { 37 | Ok(Self()) 38 | } 39 | } 40 | 41 | #[async_trait] 42 | impl Handler for EmptyActor { 43 | async fn handle(&mut self, _: &Context, _: EmptyMessage) -> Result<(), anyhow::Error> { 44 | Ok(()) 45 | } 46 | } 47 | hyperactor::remote!(EmptyActor); 48 | -------------------------------------------------------------------------------- /hyperactor_mesh/test/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// This is an "empty shell" bootstrap process, 10 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 11 | #[tokio::main] 12 | async fn main() { 13 | hyperactor_mesh::bootstrap_or_die().await; 14 | } 15 | -------------------------------------------------------------------------------- /hyperactor_mesh/test/process_allocator_cleanup/process_allocator_test_bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// A simple bootstrap binary that writes logs out to a file. This is useful for 10 | /// debugging, as normally the ProcessAllocator children logs are piped back to 11 | /// ProcessAllocator. 
When we are testing what happens when we sigkill 12 | /// ProcessAllocator, we want to see what is happening on the children. 13 | #[tokio::main] 14 | async fn main() { 15 | // Initialize tracing to a separate log file per child 16 | let pid = std::process::id(); 17 | let log_file_path = format!("/tmp/child_log{}", pid); 18 | let log_file = std::fs::File::create(&log_file_path).expect("Failed to create log file"); 19 | 20 | tracing_subscriber::fmt() 21 | .with_writer(log_file) 22 | .with_ansi(false) // No color codes in file 23 | .init(); 24 | 25 | // Let the user know where to find our logs 26 | eprintln!("CHILD_LOG_FILE:{}: {}", pid, log_file_path); 27 | 28 | hyperactor_mesh::bootstrap_or_die().await; 29 | } 30 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_mesh_macros:hyperactor_mesh_macros 2 | 3 | [package] 4 | name = "hyperactor_mesh_macros" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [dependencies] 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Clippy can't see through quote! 
to use of proc-macro2 10 | #![allow(unused_crate_dependencies)] 11 | 12 | extern crate proc_macro; 13 | 14 | use proc_macro::TokenStream; 15 | use quote::quote; 16 | 17 | /// Parse a compact selection expression into a [`Selection`]. See 18 | /// [`selection::parse`] for syntax documentation. 19 | #[proc_macro] 20 | pub fn sel(input: TokenStream) -> TokenStream { 21 | match ndslice::selection::token_parser::parse_tokens(input.into()) { 22 | Ok(selection) => { 23 | let tokens = ndslice::selection::token_parser::selection_to_tokens(&selection); 24 | quote!(#tokens).into() 25 | } 26 | Err(e) => { 27 | let msg = format!("sel! parse failed: {}", e); 28 | quote!(compile_error!(#msg)).into() 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_multiprocess:hyperactor_multiprocess 2 | 3 | [package] 4 | name = "hyperactor_multiprocess" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 15 | enum-as-inner = "0.6.0" 16 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 17 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 18 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 19 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 20 | remoteprocess = { git = "https://github.com/technicianted/remoteprocess", rev = "72505594a19d80c07df6f1dc4a80556b7e462148" } 21 | serde = { version = "1.0.185", features = ["derive", "rc"] } 22 | thiserror = "2.0.12" 23 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 24 | tokio-retry = "0.3" 25 | tracing = { 
version = "0.1.41", features = ["attributes", "valuable"] } 26 | 27 | [dev-dependencies] 28 | maplit = "1.0" 29 | rand = { version = "0.8", features = ["small_rng"] } 30 | regex = "1.11.1" 31 | serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] } 32 | timed_test = { version = "0.0.0", path = "../timed_test" } 33 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 34 | 35 | [target.'cfg(not(target_os = "linux"))'.dependencies] 36 | py-spy = { git = "https://github.com/technicianted/py-spy", rev = "8f74f3e4f955fee57f0d4a8103511ee788348a2a" } 37 | 38 | [target.'cfg(target_os = "linux")'.dependencies] 39 | py-spy = { git = "https://github.com/technicianted/py-spy", rev = "8f74f3e4f955fee57f0d4a8103511ee788348a2a", features = ["unwind"] } 40 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Multiprocess actor system and support. 10 | 11 | #![feature(assert_matches)] 12 | #![feature(never_type)] 13 | #![deny(missing_docs)] 14 | 15 | /// TODO: add missing doc. 16 | pub mod ping_pong; 17 | pub mod proc_actor; 18 | /// TODO: add missing doc. 19 | pub mod scheduler; 20 | /// TODO: add missing doc. 21 | pub mod supervision; 22 | /// TODO: add missing doc. 23 | pub mod system; 24 | pub mod system_actor; 25 | 26 | /// py-spy wrapper. 
27 | pub mod pyspy; 28 | 29 | pub use hyperactor::actor; 30 | pub use system::System; 31 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/scheduler.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | 11 | /// TODO: add missing doc 12 | #[async_trait] 13 | pub trait Scheduler { 14 | /// TODO: add missing doc 15 | type GangHandle; 16 | /// TODO: add missing doc 17 | async fn schedule_gang(&self, size: u64) -> Result; 18 | } 19 | 20 | /// TODO: add missing doc 21 | pub struct UnimplementedScheduler; 22 | 23 | #[async_trait] 24 | impl Scheduler for UnimplementedScheduler { 25 | type GangHandle = !; 26 | 27 | async fn schedule_gang(&self, _size: u64) -> Result { 28 | unimplemented!() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /hyperactor_telemetry/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry:hyperactor_telemetry 2 | 3 | [package] 4 | name = "hyperactor_telemetry" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 13 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 14 | hdrhistogram = "7.5" 15 | lazy_static = "1.5" 16 | opentelemetry = "0.29" 17 | opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] } 18 | rand = { version = "0.8", features = ["small_rng"] } 19 | scuba = { version 
= "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] } 22 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 23 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 24 | tracing-appender = "0.2.3" 25 | tracing-core = { version = "0.1.33", features = ["valuable"] } 26 | tracing-glog = { version = "0.4.1", features = ["ansi", "tracing-log"] } 27 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 28 | whoami = "1.5" 29 | 30 | [features] 31 | default = [] 32 | fbcode_build = ["fbinit", "scuba"] 33 | 34 | [lints] 35 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 36 | -------------------------------------------------------------------------------- /hyperactor_telemetry/src/otel.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #[allow(dead_code)] 10 | pub fn tracing_layer< 11 | S: tracing::Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>, 12 | >() -> Option> { 13 | #[cfg(fbcode_build)] 14 | { 15 | Some(crate::meta::tracing_layer()) 16 | } 17 | #[cfg(not(fbcode_build))] 18 | { 19 | None:: + Send + Sync>> 20 | } 21 | } 22 | 23 | #[allow(dead_code)] 24 | pub fn init_metrics() { 25 | #[cfg(fbcode_build)] 26 | { 27 | opentelemetry::global::set_meter_provider(crate::meta::meter_provider()); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /hyperactor_telemetry/stubs/fbinit/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Stub implementation of fbinit for OSS builds 10 | //! 11 | //! This is a minimal implementation that provides the necessary API surface 12 | //! for code that depends on fbinit, but doesn't actually do anything. 
13 | 14 | /// A stub for the fbinit context 15 | #[derive(Clone, Copy, Debug)] 16 | pub struct FacebookInit; 17 | 18 | /// A trait for types that require fbinit 19 | pub trait MainWithFbinit { 20 | fn init_and_run(self, _fb: FacebookInit) -> i32; 21 | } 22 | 23 | /// Initialize the Facebook runtime (stub implementation) 24 | pub fn initialize_with_client_logging(_args: &[&str]) -> FacebookInit { 25 | FacebookInit 26 | } 27 | 28 | /// Initialize the Facebook runtime (stub implementation) 29 | pub fn initialize() -> FacebookInit { 30 | FacebookInit 31 | } 32 | 33 | /// Run a function with fbinit (stub implementation) 34 | pub fn run_with_init(f: F) -> R 35 | where 36 | F: FnOnce(FacebookInit) -> R, 37 | { 38 | f(FacebookInit) 39 | } 40 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry/tester:tester 2 | 3 | [package] 4 | name = "tester" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "tester" 12 | path = "main.rs" 13 | 14 | [dependencies] 15 | hyperactor_telemetry = { version = "0.0.0", path = ".." } 16 | opentelemetry = "0.29" 17 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 18 | 19 | [lints] 20 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 21 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use hyperactor_telemetry::DefaultTelemetryClock; 10 | use hyperactor_telemetry::declare_static_counter; 11 | use hyperactor_telemetry::declare_static_gauge; 12 | use hyperactor_telemetry::declare_static_histogram; 13 | use hyperactor_telemetry::initialize_logging; 14 | 15 | // Declare static metrics for testing 16 | declare_static_counter!(REQUEST_COUNT, "test_requests"); 17 | declare_static_gauge!(MEMORY_USAGE, "test_memory_usage"); 18 | declare_static_histogram!(REQUEST_DURATION, "test_request_duration"); 19 | 20 | #[tracing::instrument] 21 | fn something_an_actor_would_do() { 22 | tracing::debug!("debug message"); 23 | } 24 | 25 | fn main() { 26 | // Initialize logging with default configuration 27 | initialize_logging(DefaultTelemetryClock {}); 28 | tracing::info!("info log"); 29 | } 30 | -------------------------------------------------------------------------------- /monarch_extension/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // Only set torch-related rpaths if tensor_engine feature is enabled 11 | #[cfg(feature = "tensor_engine")] 12 | { 13 | // `torch-sys` will set this env var through Cargo `links` metadata. 14 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 15 | // Set the rpath so that the dynamic linker can find libtorch and friends. 
16 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 17 | 18 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 19 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /monarch_extension/src/blocking.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use libc::atexit; 10 | use pyo3::prelude::*; 11 | use tokio::time::Duration; 12 | 13 | extern "C" fn exit_handler() { 14 | loop { 15 | #[allow(clippy::disallowed_methods)] 16 | std::thread::sleep(Duration::from_secs(60)); 17 | } 18 | } 19 | 20 | /// A function that blocks when called. 21 | /// This is used for testing stuck jobs in the Python bindings. 22 | #[pyfunction] 23 | pub fn blocking_function() { 24 | // SAFETY: 25 | // This is in order to simulate a process in tests that never exits. 26 | unsafe { 27 | atexit(exit_handler); 28 | } 29 | } 30 | 31 | /// Register Python bindings for the blocking module. 32 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 33 | let f = wrap_pyfunction!(blocking_function, module)?; 34 | f.setattr( 35 | "__module__", 36 | "monarch._rust_bindings.monarch_extension.blocking", 37 | )?; 38 | module.add_function(f)?; 39 | Ok(()) 40 | } 41 | -------------------------------------------------------------------------------- /monarch_extension/src/panic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use pyo3::prelude::*; 10 | 11 | /// A function that panics when called. 12 | /// This is used for testing panic handling in the Python bindings. 13 | #[pyfunction] 14 | pub fn panicking_function() { 15 | panic!("This is a deliberate panic from panicking_function"); 16 | } 17 | 18 | /// Register Python bindings for the panic module. 19 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 20 | let f = wrap_pyfunction!(panicking_function, module)?; 21 | f.setattr( 22 | "__module__", 23 | "monarch._rust_bindings.monarch_extension.panic", 24 | )?; 25 | module.add_function(f)?; 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /monarch_extension/src/simulation_tools.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use hyperactor::clock::Clock; 10 | use hyperactor::clock::SimClock; 11 | use hyperactor::simnet; 12 | use pyo3::prelude::*; 13 | 14 | #[pyfunction] 15 | #[pyo3(name = "start_event_loop")] 16 | pub fn start_simnet_event_loop(py: Python) -> PyResult> { 17 | pyo3_async_runtimes::tokio::future_into_py(py, async move { 18 | simnet::start(); 19 | Ok(()) 20 | }) 21 | } 22 | 23 | #[pyfunction] 24 | #[pyo3(name="sleep",signature=(seconds))] 25 | pub fn py_sim_sleep<'py>(py: Python<'py>, seconds: f64) -> PyResult> { 26 | let millis = (seconds * 1000.0).ceil() as u64; 27 | pyo3_async_runtimes::tokio::future_into_py(py, async move { 28 | let duration = tokio::time::Duration::from_millis(millis); 29 | SimClock.sleep(duration).await; 30 | Ok(()) 31 | }) 32 | } 33 | 34 | pub(crate) fn register_python_bindings(simulation_tools_mod: &Bound<'_, PyModule>) -> PyResult<()> { 35 | { 36 | let f = wrap_pyfunction!(py_sim_sleep, simulation_tools_mod)?; 37 | f.setattr( 38 | "__module__", 39 | "monarch._rust_bindings.monarch_extension.simulation_tools", 40 | )?; 41 | simulation_tools_mod.add_function(f)?; 42 | } 43 | { 44 | let f = wrap_pyfunction!(start_simnet_event_loop, simulation_tools_mod)?; 45 | f.setattr( 46 | "__module__", 47 | "monarch._rust_bindings.monarch_extension.simulation_tools", 48 | )?; 49 | simulation_tools_mod.add_function(f)?; 50 | } 51 | Ok(()) 52 | } 53 | -------------------------------------------------------------------------------- /monarch_hyperactor/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_hyperactor:[monarch_hyperactor,process_allocator-oss] 2 | 3 | [package] 4 | name = "monarch_hyperactor" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | async-once-cell = "0.4.2" 13 | async-trait = "0.1.86" 14 | bincode = "1.3.3" 15 | clap = { version = "4.5.38", 
features = ["derive", "env", "string", "unicode", "wrap_help"] } 16 | erased-serde = "0.3.27" 17 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" } 18 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 19 | futures-util = { version = "0.3.30", features = ["compat"] } 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 24 | inventory = "0.3.8" 25 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 26 | ndslice = { version = "0.0.0", path = "../ndslice" } 27 | nix = { version = "0.29.0", features = ["dir", "event", "hostname", "inotify", "ioctl", "mman", "mount", "net", "poll", "ptrace", "reboot", "resource", "sched", "signal", "term", "time", "user", "zerocopy"] } 28 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 29 | pyo3-async-runtimes = { version = "0.24", features = ["attributes", "tokio-runtime"] } 30 | serde = { version = "1.0.185", features = ["derive", "rc"] } 31 | serde_bytes = "0.11" 32 | tempfile = "3.15" 33 | thiserror = "2.0.12" 34 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 35 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 36 | 37 | [dev-dependencies] 38 | dir-diff = "0.3" 39 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bin/process_allocator/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | mod common; 10 | 11 | use std::str::FromStr; 12 | 13 | use clap::Parser; 14 | use common::Args; 15 | use common::main_impl; 16 | use hyperactor::channel::ChannelAddr; 17 | 18 | #[tokio::main] 19 | async fn main() { 20 | let args = Args::parse(); 21 | hyperactor::initialize_with_current_runtime(); 22 | 23 | let bind = args 24 | .addr 25 | .unwrap_or_else(|| format!("tcp![::]:{}", args.port)); 26 | 27 | let serve_address = ChannelAddr::from_str(&bind).unwrap(); 28 | 29 | let _ = main_impl(serve_address, args.program).await.unwrap(); 30 | } 31 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor_mesh::bootstrap_or_die; 10 | use pyo3::Bound; 11 | use pyo3::PyAny; 12 | use pyo3::PyResult; 13 | use pyo3::Python; 14 | use pyo3::pyfunction; 15 | use pyo3::types::PyAnyMethods; 16 | use pyo3::types::PyModule; 17 | use pyo3::types::PyModuleMethods; 18 | use pyo3::wrap_pyfunction; 19 | 20 | #[pyfunction] 21 | #[pyo3(signature = ())] 22 | pub fn bootstrap_main(py: Python) -> PyResult> { 23 | // SAFETY: this is a correct use of this function. 24 | let _ = unsafe { 25 | fbinit::perform_init(); 26 | }; 27 | 28 | hyperactor::tracing::debug!("entering async bootstrap"); 29 | pyo3_async_runtimes::tokio::future_into_py::<_, ()>(py, async move { 30 | // SAFETY: 31 | // - Only one of these is ever created. 32 | // - This is the entry point of this program, so this will be dropped when 33 | // no more FB C++ code is running. 
34 | let _destroy_guard = unsafe { fbinit::DestroyGuard::new() }; 35 | bootstrap_or_die().await; 36 | }) 37 | } 38 | 39 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { 40 | let f = wrap_pyfunction!(bootstrap_main, hyperactor_mod)?; 41 | f.setattr( 42 | "__module__", 43 | "monarch._rust_bindings.monarch_hyperactor.bootstrap", 44 | )?; 45 | hyperactor_mod.add_function(f)?; 46 | 47 | Ok(()) 48 | } 49 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/code_sync.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod manager; 10 | pub mod rsync; 11 | mod workspace; 12 | 13 | pub use workspace::WorkspaceLocation; 14 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/code_sync/workspace.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use std::path::PathBuf; 10 | 11 | use anyhow::Result; 12 | use serde::Deserialize; 13 | use serde::Serialize; 14 | 15 | #[derive(Clone, Debug, Serialize, Deserialize)] 16 | pub enum WorkspaceLocation { 17 | Constant(PathBuf), 18 | FromEnvVar(String), 19 | } 20 | 21 | impl WorkspaceLocation { 22 | pub fn resolve(&self) -> Result { 23 | Ok(match self { 24 | WorkspaceLocation::Constant(p) => p.clone(), 25 | WorkspaceLocation::FromEnvVar(v) => PathBuf::from(std::env::var(v)?), 26 | }) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/config.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Configuration for Monarch Hyperactor. 10 | //! 11 | //! This module provides monarch-specific configuration attributes that extend 12 | //! the base hyperactor configuration system. 13 | 14 | use hyperactor::attrs::declare_attrs; 15 | 16 | // Declare monarch-specific configuration keys 17 | declare_attrs! { 18 | /// Use a single asyncio runtime for all Python actors, rather than one per actor 19 | pub attr SHARED_ASYNCIO_RUNTIME: bool = false; 20 | } 21 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | #![feature(exit_status_error)] 11 | 12 | pub mod actor; 13 | pub mod actor_mesh; 14 | pub mod alloc; 15 | pub mod bootstrap; 16 | pub mod channel; 17 | pub mod code_sync; 18 | pub mod config; 19 | pub mod local_state_broker; 20 | pub mod mailbox; 21 | pub mod ndslice; 22 | pub mod proc; 23 | pub mod proc_mesh; 24 | pub mod runtime; 25 | pub mod selection; 26 | pub mod shape; 27 | pub mod supervision; 28 | pub mod telemetry; 29 | 30 | #[cfg(fbcode_build)] 31 | pub mod meta; 32 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/selection.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use ndslice::selection::Selection; 10 | use pyo3::PyResult; 11 | use pyo3::prelude::*; 12 | use pyo3::types::PyType; 13 | 14 | #[pyclass( 15 | name = "Selection", 16 | module = "monarch._rust_bindings.monarch_hyperactor.selection", 17 | frozen 18 | )] 19 | pub struct PySelection { 20 | inner: Selection, 21 | } 22 | 23 | impl PySelection { 24 | pub(crate) fn inner(&self) -> &Selection { 25 | &self.inner 26 | } 27 | } 28 | 29 | impl From for PySelection { 30 | fn from(inner: Selection) -> Self { 31 | Self { inner } 32 | } 33 | } 34 | 35 | #[pymethods] 36 | impl PySelection { 37 | #[getter] 38 | fn __repr__(&self) -> String { 39 | format!("{:?}", self.inner) 40 | } 41 | 42 | #[classmethod] 43 | #[pyo3(name = "from_string")] 44 | pub fn parse(_cls: Bound<'_, PyType>, input: &str) -> PyResult { 45 | let selection = ndslice::selection::parse::parse(input).map_err(|err| { 46 | pyo3::exceptions::PyValueError::new_err(format!("parse error: {err}")) 47 | })?; 48 | 49 | Ok(PySelection::from(selection)) 50 | 
} 51 | } 52 | 53 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 54 | module.add_class::()?; 55 | Ok(()) 56 | } 57 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/supervision.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use pyo3::create_exception; 10 | use pyo3::exceptions::PyRuntimeError; 11 | use pyo3::prelude::*; 12 | 13 | create_exception!( 14 | monarch._rust_bindings.monarch_hyperactor.supervision, 15 | SupervisionError, 16 | PyRuntimeError 17 | ); 18 | 19 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 20 | // Get the Python interpreter instance from the module 21 | let py = module.py(); 22 | // Add the exception to the module using its type object 23 | module.add("SupervisionError", py.get_type::())?; 24 | Ok(()) 25 | } 26 | -------------------------------------------------------------------------------- /monarch_messages/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_messages:monarch_messages 2 | 3 | [package] 4 | name = "monarch_messages" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | derive_more = { version = "1.0.0", features = ["full"] } 13 | enum-as-inner = "0.6.0" 14 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 15 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 18 | serde = { 
version = "1.0.185", features = ["derive", "rc"] } 19 | serde_bytes = "0.11" 20 | thiserror = "2.0.12" 21 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 22 | torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" } 23 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 24 | 25 | [dev-dependencies] 26 | paste = "1.0.14" 27 | -------------------------------------------------------------------------------- /monarch_messages/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_messages/src/debugger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // NOTE: Until https://github.com/PyO3/pyo3/pull/4674, `pyo3::pymethods` triggers 10 | // unsafe-op-in-unsafe-fn warnings.
11 | #![allow(unsafe_op_in_unsafe_fn)] 12 | 13 | use derive_more::From; 14 | use hyperactor::Handler; 15 | use hyperactor::Named; 16 | use pyo3::Bound; 17 | use pyo3::PyResult; 18 | use pyo3::types::PyModule; 19 | use pyo3::types::PyModuleMethods; 20 | use serde::Deserialize; 21 | use serde::Serialize; 22 | 23 | pub fn register_python_bindings(debugger: &Bound<'_, PyModule>) -> PyResult<()> { 24 | debugger.add_class::()?; 25 | Ok(()) 26 | } 27 | 28 | /// Enumerates the actions relevant to PDB debugging sessions. 29 | #[derive(Debug, Deserialize, Clone, Serialize, PartialEq)] 30 | #[pyo3::pyclass(frozen, module = "monarch._rust_bindings.monarch_messages.debugger")] 31 | pub enum DebuggerAction { 32 | /// Sent from worker to client to indicate that the worker has entered 33 | /// a pdb debugging session. 34 | Paused(), 35 | 36 | /// Sent from client to worker to indicate that the client has started 37 | /// the debugging session. 38 | Attach(), 39 | 40 | /// Sent to client or to worker to end the debugging session. 41 | Detach(), 42 | 43 | /// Sent to client or to worker to write bytes to receiver's stdout. 44 | Write { 45 | #[serde(with = "serde_bytes")] 46 | bytes: Vec, 47 | }, 48 | 49 | /// Sent from worker to client to read bytes from client's stdin. 50 | Read { requested_size: usize }, 51 | } 52 | 53 | #[derive(Serialize, Deserialize, Debug, Clone, Named, From, Handler)] 54 | pub enum DebuggerMessage { 55 | Action { action: DebuggerAction }, 56 | } 57 | 58 | hyperactor::alias!( 59 | DebuggerActor, 60 | DebuggerMessage { cast = true }, 61 | ); 62 | -------------------------------------------------------------------------------- /monarch_messages/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | pub mod client; 12 | pub mod controller; 13 | pub mod debugger; 14 | pub mod wire_value; 15 | pub mod worker; 16 | -------------------------------------------------------------------------------- /monarch_messages/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | 10 | def has_nan(t): 11 | return math.isnan(t) 12 | -------------------------------------------------------------------------------- /monarch_rdma/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma:monarch_rdma 2 | 3 | [package] 4 | name = "monarch_rdma" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | async-trait = "0.1.86" 13 | cuda-sys = { path = "../cuda-sys" } 14 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 15 | rand = { version = "0.8", features = ["small_rng"] } 16 | rdmacore-sys = { path = "../rdmacore-sys" } 17 | serde = { version = "1.0.185", features = ["derive", "rc"] } 18 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 19 | 20 | [dev-dependencies] 21 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 22 | ndslice = { version = "0.0.0", path = "../ndslice" } 23 | timed_test = { version = "0.0.0", path = "../timed_test" } 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | 26 | [features] 27 | cuda = [] 28 | default = 
["cuda"] 29 | -------------------------------------------------------------------------------- /monarch_rdma/examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma/examples:[parameter_server,parameter_server_bootstrap,parameter_server_example] 2 | 3 | [package] 4 | name = "parameter_server" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | path = "parameter_server.rs" 12 | 13 | [[bin]] 14 | name = "parameter_server_bootstrap" 15 | path = "bootstrap.rs" 16 | test = false 17 | 18 | [[bin]] 19 | name = "parameter_server_example" 20 | path = "main.rs" 21 | test = false 22 | 23 | [dependencies] 24 | anyhow = "1.0.98" 25 | async-trait = "0.1.86" 26 | buck-resources = "1" 27 | hyperactor = { version = "0.0.0", path = "../../hyperactor" } 28 | hyperactor_mesh = { version = "0.0.0", path = "../../hyperactor_mesh" } 29 | monarch_rdma = { version = "0.0.0", path = ".." } 30 | ndslice = { version = "0.0.0", path = "../../ndslice" } 31 | serde = { version = "1.0.185", features = ["derive", "rc"] } 32 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 33 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 34 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 35 | 36 | [dev-dependencies] 37 | timed_test = { version = "0.0.0", path = "../../timed_test" } 38 | -------------------------------------------------------------------------------- /monarch_rdma/examples/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unused)] 10 | use std::hint::black_box; 11 | 12 | use monarch_rdma::RdmaManagerActor; 13 | use parameter_server::ParameterServerActor; 14 | use parameter_server::WorkerActor; 15 | 16 | /// This is an "empty shell" bootstrap process, 17 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 18 | #[tokio::main] 19 | async fn main() { 20 | tracing_subscriber::fmt() 21 | .with_max_level(tracing::Level::INFO) 22 | .init(); 23 | // The following black_box lines force-link the actors needed for the parameter server 24 | // example to run. Relying on side-effects for actor registration is not consistent across 25 | // all build modes. 26 | let _ = black_box::>(None); 27 | let _ = black_box::>(None); 28 | let _ = black_box::>(None); 29 | hyperactor_mesh::bootstrap_or_die().await; 30 | } 31 | -------------------------------------------------------------------------------- /monarch_rdma/examples/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Main running script for parameter server example. 10 | //! 11 | //! This script needs to be kept separate to avoid buck naming collisions. 12 | //! 13 | //! Specifically, parameter_server::run uses ProcAllocator, which spawns 14 | //! the binary defined in //monarch/examples/rdma/bootstrap.rs. 15 | //! 16 | //! If this main script were kept in the same file as parameter_server.rs, then 17 | //! the actors defined in parameter_server would be named e.g. 18 | //! "parameter_server_example::ParameterServerActor", whereas the bootstrap binary 19 | //! expects them to be named "parameter_server::ParameterServerActor". 20 | //! 21 | //! Keeping this file separate allows us to avoid this naming collision.
22 | use parameter_server::run; 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), anyhow::Error> { 26 | run(4, 5).await 27 | } 28 | -------------------------------------------------------------------------------- /monarch_rdma/extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma/extension:monarch_rdma_extension 2 | 3 | [package] 4 | name = "monarch_rdma_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | path = "lib.rs" 12 | test = false 13 | doctest = false 14 | 15 | [dependencies] 16 | hyperactor = { version = "0.0.0", path = "../../hyperactor" } 17 | hyperactor_mesh = { version = "0.0.0", path = "../../hyperactor_mesh" } 18 | monarch_hyperactor = { version = "0.0.0", path = "../../monarch_hyperactor" } 19 | monarch_rdma = { version = "0.0.0", path = ".." } 20 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 21 | pyo3-async-runtimes = { version = "0.24", features = ["attributes", "tokio-runtime"] } 22 | serde = { version = "1.0.185", features = ["derive", "rc"] } 23 | serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] } 24 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 25 | -------------------------------------------------------------------------------- /monarch_rdma/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | // RDMA requires frequent unsafe code blocks 10 | #![allow(clippy::undocumented_unsafe_blocks)] 11 | 12 | mod ibverbs_primitives; 13 | mod rdma_components; 14 | mod rdma_manager_actor; 15 | mod test_utils; 16 | 17 | #[macro_use] 18 | mod macros; 19 | 20 | pub use ibverbs_primitives::*; 21 | pub use rdma_components::*; 22 | pub use rdma_manager_actor::*; 23 | -------------------------------------------------------------------------------- /monarch_rdma/src/macros.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// Panics with the CUDA error description when `$result` is not `CUDA_SUCCESS`. 10 | /// 11 | /// `$result` is evaluated exactly once. The expansion calls `cuda_sys::cuGetErrorString` 12 | /// and `CStr::from_ptr` without an `unsafe` block of its own, so invoke it from an 13 | /// unsafe context. 14 | #[macro_export] 15 | macro_rules! cu_check { 16 | ($result:expr) => {{ 17 | // Bind once: `$result` may be a call with side effects and must not be 18 | // re-evaluated while building the panic message. 19 | let result = $result; 20 | if result != cuda_sys::CUresult::CUDA_SUCCESS { 21 | let mut error_string: *const i8 = std::ptr::null(); 22 | cuda_sys::cuGetErrorString(result, &mut error_string); 23 | // The pointer is null when the code is not recognized; never pass null 24 | // to `CStr::from_ptr`. 25 | let message = if error_string.is_null() { 26 | std::borrow::Cow::Borrowed("unknown CUDA error") 27 | } else { 28 | std::ffi::CStr::from_ptr(error_string).to_string_lossy() 29 | }; 30 | panic!( 31 | "cuda failure {}:{} {:?} '{}'", 32 | file!(), 33 | line!(), 34 | result, 35 | message 36 | ); 37 | } 38 | }}; 39 | } 40 | -------------------------------------------------------------------------------- /monarch_simulator/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_simulator:monarch_simulator_lib 2 | 3 | [package] 4 | name = "monarch_simulator_lib" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.98" 12 | async-trait = "0.1.86" 13 | controller = { version = "0.0.0", path = "../controller" } 14 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 15 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 16 | hyperactor = { version = "0.0.0",
path = "../hyperactor" } 17 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 18 | lazy_static = "1.5" 19 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 20 | monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker" } 21 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 22 | ndslice = { version = "0.0.0", path = "../ndslice" } 23 | serde = { version = "1.0.185", features = ["derive", "rc"] } 24 | serde_json = { version = "1.0.140", features = ["alloc", "float_roundtrip", "unbounded_depth"] } 25 | thiserror = "2.0.12" 26 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 27 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 28 | torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" } 29 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 30 | 31 | [dev-dependencies] 32 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 33 | -------------------------------------------------------------------------------- /monarch_simulator/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor::actor::ActorError; 10 | use hyperactor::simnet::SimNetError; 11 | 12 | pub mod bootstrap; 13 | mod collective_coordinator; 14 | pub mod controller; 15 | pub mod simulator; 16 | pub mod worker; 17 | 18 | /// The type of error that can occur on channel operations. 19 | #[derive(thiserror::Error, Debug)] 20 | pub enum SimulatorError { 21 | /// Error during simnet operation. 22 | #[error(transparent)] 23 | SimNetError(#[from] SimNetError), 24 | 25 | /// Error during actor operations. 
26 | #[error(transparent)] 27 | ActorError(#[from] ActorError), 28 | 29 | /// Simulator cannot find the world with given name. 30 | #[error("World {0} not found")] 31 | WorldNotFound(String), 32 | 33 | /// Cannot find the mesh in simulator. 34 | #[error("Mesh not found {0}")] 35 | MeshNotFound(String), 36 | } 37 | -------------------------------------------------------------------------------- /monarch_tensor_worker/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_tensor_worker/test_worker_main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """ 8 | Simplified version of worker_main.py for testing the monarch_tensor_worker standalone. 9 | 10 | We want a Python entrypoint here because we want to initialize the Monarch 11 | Python extension on the main thread. 12 | """ 13 | 14 | 15 | def main() -> None: 16 | # torch is imported to make sure all the dynamic types are registered 17 | import torch # noqa 18 | 19 | # Force CUDA initialization early on.
CUDA init is lazy, and Python CUDA 20 | # APIs are guarded to init CUDA if necessary. But our worker calls 21 | # raw libtorch APIs which are not similarly guarded. So just initialize here 22 | # to avoid issues with potentially using uninitialized CUDA state. 23 | torch.cuda.init() 24 | 25 | from monarch._rust_bindings.monarch_extension import ( # @manual=//monarch/monarch_extension:monarch_extension 26 | tensor_worker, 27 | ) 28 | 29 | # pyre-ignore[16] 30 | tensor_worker.worker_main() 31 | 32 | 33 | if __name__ == "__main__": 34 | # Do not add code here, it won't be run. Add them to the function called below. 35 | main() # pragma: no cover 36 | -------------------------------------------------------------------------------- /monarch_types/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_types:monarch_types 2 | 3 | [package] 4 | name = "monarch_types" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | derive_more = { version = "1.0.0", features = ["full"] } 12 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 13 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 14 | serde = { version = "1.0.185", features = ["derive", "rc"] } 15 | serde_bytes = "0.11" 16 | 17 | [dev-dependencies] 18 | anyhow = "1.0.98" 19 | timed_test = { version = "0.0.0", path = "../timed_test" } 20 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 21 | -------------------------------------------------------------------------------- /monarch_types/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | mod pyobject; 12 | mod python; 13 | mod pytree; 14 | 15 | pub use pyobject::PickledPyObject; 16 | pub use python::SerializablePyErr; 17 | pub use python::TryIntoPyObjectUnsafe; 18 | pub use pytree::PyTree; 19 | -------------------------------------------------------------------------------- /nccl-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nccl-sys" 3 | version = "0.0.0" 4 | authors = ["Facebook"] 5 | edition = "2021" 6 | license = "MIT" 7 | links = "nccl" 8 | 9 | [dependencies] 10 | cxx = "1.0.119" 11 | serde = { version = "1.0.185", features = ["derive", "rc"] } 12 | 13 | [build-dependencies] 14 | bindgen = "0.70.1" 15 | which = "6.0.3" 16 | glob = "0.3.1" 17 | -------------------------------------------------------------------------------- /nccl-sys/src/nccl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | -------------------------------------------------------------------------------- /ndslice/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/ndslice:ndslice 2 | 3 | [package] 4 | name = "ndslice" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "data structures to support n-d arrays of ranks" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [dependencies] 13 | anyhow = "1.0.98" 14 | enum-as-inner = "0.6.0" 15 | itertools = "0.14.0" 16 | nom = "8" 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | rand = { version = "0.8", features = ["small_rng"] } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | thiserror = "2.0.12" 22 | 23 | [dev-dependencies] 24 | proptest = "1.5" 25 | -------------------------------------------------------------------------------- /ndslice/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Core mesh components for the hyperactor framework. 10 | //! 11 | //! Provides [`Slice`], a compact representation of a subset of a 12 | //! multidimensional array. See [`Slice`] for more details. 13 | //! 14 | //! This crate defines the foundational abstractions used in 15 | //! hyperactor's mesh layer, including multidimensional shapes and 16 | //! selection algebra. The crate avoids dependencies on procedural 17 | //! macros and other higher-level constructs, enabling reuse in both 18 | //! runtime and macro contexts. 
19 | 20 | #![feature(assert_matches)] 21 | #![recursion_limit = "512"] 22 | 23 | mod slice; 24 | pub use slice::DimSliceIterator; 25 | pub use slice::Slice; 26 | pub use slice::SliceError; 27 | pub use slice::SliceIterator; 28 | 29 | /// Selection algebra for describing multidimensional mesh regions. 30 | pub mod selection; 31 | 32 | /// Core types for representing multidimensional shapes and strides. 33 | pub mod shape; 34 | 35 | /// Reshaping transformations for multidimensional slices and shapes. 36 | pub mod reshape; 37 | 38 | /// The selection expression type used to define routing constraints. 39 | pub use selection::Selection; 40 | /// DSL-style constructors for building `Selection` expressions. 41 | pub use selection::dsl; 42 | /// Represents an interval with an optional end and step, used to 43 | /// define extents in `Shape` and coordinate filters in `Selection`. 44 | pub use shape::Range; 45 | /// Describes the size and layout of a multidimensional mesh. 46 | pub use shape::Shape; 47 | /// Errors that can occur during shape construction or validation. 48 | pub use shape::ShapeError; 49 | 50 | /// Property-based generators for randomized test input. 51 | #[cfg(test)] 52 | pub mod strategy; 53 | 54 | /// Utilities. 
55 | pub mod utils; 56 | -------------------------------------------------------------------------------- /preempt_rwlock/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/preempt_rwlock:preempt_rwlock 2 | 3 | [package] 4 | name = "preempt_rwlock" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 12 | 13 | [dev-dependencies] 14 | anyhow = "1.0.98" 15 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | 3 | markers = [ 4 | "oss_skip: marks tests to skip in OSS CI", 5 | ] 6 | asyncio_mode = "auto" 7 | # Default timeout of 5 minutes 8 | timeout = 300 9 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/__init__.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # True iff the rust extension was built with the tensor engine feature. 8 | def has_tensor_engine() -> bool: ... 
9 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/27b2e0d942ab98fc9c3e74f4ea512d8f28d515b8/python/monarch/_rust_bindings/monarch_extension/__init__.pyi -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/blocking.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def blocking_function() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/code_sync.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from pathlib import Path 8 | from typing import final 9 | 10 | from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh 11 | 12 | from monarch._rust_bindings.monarch_hyperactor.shape import Shape 13 | 14 | class WorkspaceLocation: 15 | """ 16 | Python binding for the Rust WorkspaceLocation enum. 17 | """ 18 | @final 19 | class Constant(WorkspaceLocation): 20 | def __init__(self, path) -> None: ... 21 | 22 | @final 23 | class FromEnvVar(WorkspaceLocation): 24 | def __init__(self, var) -> None: ... 25 | 26 | def resolve(self) -> Path: 27 | """ 28 | Resolve the workspace location to a Path. 29 | """ 30 | ... 
31 | 32 | @final 33 | class WorkspaceShape: 34 | """ 35 | Python binding for the Rust WorkspaceShape struct. 36 | """ 37 | @staticmethod 38 | def shared(label: str) -> "WorkspaceShape": ... 39 | @staticmethod 40 | def exclusive() -> "WorkspaceShape": ... 41 | 42 | @final 43 | class RemoteWorkspace: 44 | """ 45 | Python binding for the Rust RemoteWorkspace struct. 46 | """ 47 | def __init__(self, location: WorkspaceLocation, shape: WorkspaceShape) -> None: ... 48 | 49 | @final 50 | class CodeSyncMeshClient: 51 | """ 52 | Python binding for the Rust CodeSyncMeshClient. 53 | """ 54 | @staticmethod 55 | def spawn_blocking( 56 | proc_mesh: ProcMesh, 57 | ) -> CodeSyncMeshClient: ... 58 | async def sync_workspace( 59 | self, 60 | *, 61 | local: str, 62 | remote: RemoteWorkspace, 63 | auto_reload: bool = False, 64 | ) -> None: ... 65 | async def sync_workspaces( 66 | self, 67 | *, 68 | workspaces: list[tuple[str, RemoteWorkspace]], 69 | auto_reload: bool = False, 70 | ) -> None: ... 71 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/logging.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import final 10 | 11 | from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh 12 | 13 | @final 14 | class LoggingMeshClient: 15 | """ 16 | Python binding for the Rust LoggingMeshClient. 17 | """ 18 | @staticmethod 19 | async def spawn( 20 | proc_mesh: ProcMesh, 21 | ) -> LoggingMeshClient: ... 22 | def set_mode(self, stream_to_client: bool) -> None: ... 
23 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/mesh_controller.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from traceback import FrameSummary 8 | from typing import List, NamedTuple, Sequence, Tuple, Union 9 | 10 | from monarch._rust_bindings.monarch_extension import client 11 | from monarch._rust_bindings.monarch_hyperactor.mailbox import PortId 12 | from monarch._rust_bindings.monarch_hyperactor.proc import ActorId 13 | from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh 14 | 15 | from monarch._rust_bindings.monarch_hyperactor.shape import Slice as NDSlice 16 | 17 | class _Controller: 18 | def __init__(self) -> None: ... 19 | def node( 20 | self, 21 | seq: int, 22 | defs: Sequence[object], 23 | uses: Sequence[object], 24 | port: Tuple[PortId, NDSlice] | None, 25 | tracebacks: List[List[FrameSummary]], 26 | ) -> None: ... 27 | def drop_refs(self, refs: Sequence[object]) -> None: ... 28 | def send( 29 | self, 30 | ranks: Union[NDSlice, List[NDSlice]], 31 | msg: NamedTuple, 32 | ) -> None: ... 33 | def _drain_and_stop( 34 | self, 35 | ) -> List[client.LogMessage | client.WorkerResponse | client.DebuggerMessage]: ... 36 | def sync_at_exit(self, port: PortId) -> None: 37 | """ 38 | Controller waits until all nodes that were added are complete, then replies on the 39 | given port. The port will get an exception if there was a known error that was not reported 40 | to any future. 41 | """ 42 | ... 43 | 44 | @property 45 | def broker_id(self) -> Tuple[str, int]: ... 
46 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/panic.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def panicking_function() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/simulation_tools.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | async def sleep(seconds: float) -> None: 8 | """ 9 | Asyncio friendly sleep that waits for the simulator event loop to wake up 10 | """ 11 | ... 12 | 13 | async def start_event_loop() -> None: 14 | """ 15 | Starts the simulator event loop 16 | """ 17 | ... 18 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/simulator_client.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | @final 10 | class SimulatorClient: 11 | """ 12 | A wrapper around [simulator_client::Simulatorclient] to expose it to python. 13 | It is a client to communicate with the simulator service. 14 | 15 | Arguments: 16 | - `system_addr`: Address of the system. 
17 | - `world_size`: Number of workers in a given mesh. 18 | """ 19 | 20 | def __init__(self, system_addr: str, world_size: int) -> None: ... 21 | def kill_world(self, world_name: str) -> None: 22 | """ 23 | Kill the world with the given name. 24 | 25 | Arguments: 26 | - `world_name`: Name of the world to kill. 27 | """ 28 | ... 29 | def spawn_mesh( 30 | self, system_addr: str, controller_actor_id: str, worker_world: str 31 | ) -> None: 32 | """ 33 | Spawn a mesh actor. 34 | 35 | Arguments: 36 | - `system_addr`: Address of the system to spawn the mesh in. 37 | - `controller_actor_id`: Actor id of the controller to spawn the mesh in. 38 | - `worker_world`: World of the worker to spawn the mesh in. 39 | """ 40 | ... 41 | 42 | def set_training_script_state_running(self) -> None: 43 | """ 44 | Let the simulator know that the training script is actively sending 45 | commands to the backend 46 | """ 47 | ... 48 | 49 | def set_training_script_state_waiting(self) -> None: 50 | """ 51 | Let the simulator know that the training script is waiting for the 52 | backend to resolve a future 53 | """ 54 | ... 55 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/bootstrap.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | def bootstrap_main() -> None: ... 10 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/channel.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from enum import Enum 10 | 11 | class ChannelTransport(Enum): 12 | Tcp = "tcp" 13 | MetaTls = "metatls" 14 | Local = "local" 15 | Unix = "unix" 16 | # Sim # TODO add support 17 | 18 | class ChannelAddr: 19 | @staticmethod 20 | def any(transport: ChannelTransport) -> str: 21 | """Returns an "any" address for the given transport type. 22 | 23 | Primarily used to bind servers. The returned string can be 24 | converted into `hyperactor::channel::ChannelAddr` (in Rust) by 25 | calling `hyperactor::channel::ChannelAddr::from_str()`. 26 | """ 27 | ... 28 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/runtime.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | """ 10 | Type hints for the runtime module. 11 | """ 12 | 13 | def sleep_indefinitely_for_unit_tests() -> None: 14 | """ 15 | A test function that sleeps indefinitely in a loop. 16 | This is used for testing signal handling in signal_safe_block_on. 17 | The function will sleep forever until interrupted by a signal. 18 | 19 | Raises: 20 | KeyboardInterrupt: When interrupted by a signal like SIGINT 21 | """ 22 | ... 23 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/selection.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
@final
class Selection:
    """Opaque representation of a selection expression used to represent
    constraints over multidimensional shapes.

    Construct via `from_string()` and use with mesh APIs to filter,
    evaluate, or route over structured topologies.
    """
    def __repr__(self) -> str: ...
    @classmethod
    def from_string(cls, s: str) -> Selection:
        """Parse a selection expression from a string.

        Accepts a compact string syntax such as `"(*, 0:4)"` or `"0 & (1 | 2)"`,
        and returns a structured Selection object.

        Raises:
            ValueError: if the input string is not a valid selection expression.
        """
        ...
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Union 8 | 9 | @final 10 | class DebuggerAction: 11 | """Enum representing actions for the debugger communication between worker and client.""" 12 | 13 | class Paused: 14 | """ 15 | Sent from worker to client to indicate that the worker has entered 16 | a pdb debugging session. 17 | """ 18 | 19 | pass 20 | 21 | class Attach: 22 | """ 23 | Sent from client to worker to indicate that the client has started 24 | the debugging session. 25 | """ 26 | 27 | pass 28 | 29 | class Detach: 30 | """Sent to client or to worker to end the debugging session.""" 31 | 32 | pass 33 | 34 | class Write: 35 | """Sent to client or to worker to write bytes to receiver's stdout.""" 36 | 37 | def __init__(self, bytes: bytes) -> None: ... 38 | 39 | class Read: 40 | """Sent from worker to client to read bytes from client's stdin.""" 41 | 42 | def __init__(self, requested_size: int) -> None: ... 43 | @property 44 | def requested_size(self) -> int: 45 | """Get the number of bytes to read from stdin.""" 46 | ... 47 | 48 | DebuggerActionType = Union[ 49 | DebuggerAction.Paused, 50 | DebuggerAction.Attach, 51 | DebuggerAction.Detach, 52 | DebuggerAction.Read, 53 | DebuggerAction.Write, 54 | ] 55 | -------------------------------------------------------------------------------- /python/monarch/_src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/27b2e0d942ab98fc9c3e74f4ea512d8f28d515b8/python/monarch/_src/__init__.py -------------------------------------------------------------------------------- /python/monarch/_src/actor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """ 8 | Monarch Actor API 9 | """ 10 | -------------------------------------------------------------------------------- /python/monarch/_src/actor/code_sync/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from monarch._rust_bindings.monarch_extension.code_sync import ( # noqa: F401 8 | CodeSyncMeshClient, 9 | RemoteWorkspace, 10 | WorkspaceLocation, 11 | WorkspaceShape, 12 | ) 13 | -------------------------------------------------------------------------------- /python/monarch/_src/actor/device_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import re 9 | from pathlib import Path 10 | 11 | 12 | def _local_device_count(): 13 | if "CUDA_VISIBLE_DEVICES" in os.environ: 14 | return len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) 15 | dev_path = Path("/dev") 16 | pattern = re.compile(r"nvidia\d+$") 17 | nvidia_devices = [dev for dev in dev_path.iterdir() if pattern.match(dev.name)] 18 | return len(nvidia_devices) 19 | -------------------------------------------------------------------------------- /python/monarch/_src/actor/telemetry/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
class TracingForwarder(logging.Handler):
    """Logging handler that hands every record it receives to
    ``forward_to_tracing`` (imported from the
    ``monarch._rust_bindings.monarch_hyperactor.telemetry`` binding).
    """

    def emit(self, record: logging.LogRecord) -> None:
        # No formatting or filtering is done here; the record object itself
        # is passed through unchanged.
        forward_to_tracing(record)
9 | """ 10 | 11 | from monarch._src.actor.actor_mesh import ( 12 | Accumulator, 13 | Actor, 14 | ActorError, 15 | current_actor_name, 16 | current_rank, 17 | current_size, 18 | endpoint, 19 | MonarchContext, 20 | Point, 21 | port, 22 | send, 23 | ValueMesh, 24 | ) 25 | from monarch._src.actor.future import Future 26 | from monarch._src.actor.proc_mesh import ( 27 | debug_client, 28 | local_proc_mesh, 29 | proc_mesh, 30 | ProcMesh, 31 | sim_proc_mesh, 32 | ) 33 | 34 | __all__ = [ 35 | "Accumulator", 36 | "Actor", 37 | "ActorError", 38 | "current_actor_name", 39 | "current_rank", 40 | "current_size", 41 | "endpoint", 42 | "Future", 43 | "local_proc_mesh", 44 | "MonarchContext", 45 | "Point", 46 | "proc_mesh", 47 | "ProcMesh", 48 | "port", 49 | "send", 50 | "sim_proc_mesh", 51 | "ValueMesh", 52 | "debug_client", 53 | ] 54 | -------------------------------------------------------------------------------- /python/monarch/actor_mesh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import warnings 8 | 9 | warnings.warn( 10 | "monarch.actor_mesh is deprecated, please import from monarch.actor instead.", 11 | DeprecationWarning, 12 | stacklevel=2, 13 | ) 14 | 15 | from monarch._src.actor.actor_mesh import * # noqa 16 | -------------------------------------------------------------------------------- /python/monarch/bootstrap_main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import warnings 8 | 9 | warnings.warn( 10 | "monarch.bootstrap_main is deprecated, please use from monarch._src.actor.bootstrap_main instead.", 11 | DeprecationWarning, 12 | stacklevel=2, 13 | ) 14 | 15 | from monarch._src.actor.bootstrap_main import * # noqa 16 | 17 | 18 | if __name__ == "__main__": 19 | # noqa 20 | invoke_main() # pragma: no cover 21 | -------------------------------------------------------------------------------- /python/monarch/builtins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | """ 9 | Builtins for Monarch is a set of remote function definitions for PyTorch functions and other utilities. 10 | """ 11 | 12 | from .log import log_remote, set_logging_level_remote 13 | 14 | __all__ = ["log_remote", "set_logging_level_remote"] 15 | -------------------------------------------------------------------------------- /python/monarch/builtins/log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import logging 8 | 9 | from monarch.common.remote import remote 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @remote(propagate="inspect") 16 | def log_remote(*args, level: int = logging.WARNING, **kwargs) -> None: 17 | logger.log(level, *args, **kwargs) 18 | 19 | 20 | @remote(propagate="inspect") 21 | def set_logging_level_remote(level: int) -> None: 22 | logger.setLevel(level) 23 | -------------------------------------------------------------------------------- /python/monarch/builtins/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre strict 8 | from typing import Callable 9 | 10 | import torch 11 | from monarch.common.remote import remote 12 | 13 | 14 | @remote(propagate="inspect") 15 | def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None: 16 | torch.manual_seed(seed ^ process_idx) 17 | 18 | 19 | @remote(propagate=lambda: torch.zeros(1)) 20 | def get_rng_state_remote() -> torch.Tensor: 21 | return torch.get_rng_state() 22 | 23 | 24 | @remote(propagate="inspect") 25 | def set_rng_state_remote(new_state: torch.Tensor) -> None: 26 | torch.set_rng_state(new_state) 27 | 28 | 29 | def _run_no_return(f: Callable) -> None: 30 | f() 31 | return None 32 | 33 | 34 | # TODO: return result when uint64 is supported from remote function 35 | @remote(propagate=lambda: _run_no_return(torch.seed)) 36 | def seed_remote() -> None: 37 | torch.seed() 38 | 39 | 40 | # same underlying implementation as seed_remote (torch.seed) 41 | # TODO: return result when uint64 is supported from remote function 42 | @remote(propagate=lambda: _run_no_return(torch.random.seed)) 43 | def random_seed_remote() -> None: 44 | torch.random.seed() 45 | 46 | 47 | 
@remote(propagate="inspect")
def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
    """Install ``states`` as the CUDA RNG states via ``torch.cuda.set_rng_state_all``.

    Args:
        states: one RNG state tensor per CUDA device, in the format produced by
            ``torch.cuda.get_rng_state_all`` (see ``get_rng_state_all_cuda_remote``).
    """
    # NOTE(review): the exact meaning of propagate="inspect" is defined by
    # monarch.common.remote, which is not visible here — confirm before relying on it.
    torch.cuda.set_rng_state_all(states)
# All of the tensor examples in this zoo inherit from BaseTensor.  Ideally,
# however, they would inherit directly from Tensor.  This is just our staging
# ground for applying behavior that hasn't yet made it into core but that
# we would like to apply by default.
class BaseTensor(torch.Tensor):
    """Shared base class for the ``torch.Tensor`` subclasses built on top of it.

    Construction takes an existing tensor ``elem``.  When ``requires_grad`` is
    given, the instance is created with ``_make_subclass`` and that flag;
    otherwise construction is delegated to the parent ``__new__``.
    """

    # See https://github.com/pytorch/pytorch/pull/73727 ; this is necessary
    # to ensure that super().__new__ can cooperate with each other
    @staticmethod
    def __new__(cls, elem, *, requires_grad=None):
        # An explicit flag goes through _make_subclass so it is applied to the result.
        if requires_grad is not None:
            return cls._make_subclass(cls, elem, requires_grad)
        return super().__new__(cls, elem)

    # If __torch_dispatch__ is defined (which it will be for all our examples)
    # the default torch function implementation (which preserves subclasses)
    # typically must be disabled
    __torch_function__ = torch._C._disabled_torch_function_impl
class _ContextManager:
    """Context-manager wrapper around a generator that is started immediately.

    The generator is advanced to its first ``yield`` in ``__init__``; leaving
    the ``with`` block resumes it once more and expects it to finish.
    """

    def __init__(self, generator):
        self.generator = generator
        # Run the setup half of the generator right now, not on __enter__.
        self.generator.send(None)

    def __enter__(self):
        # The context is already active; nothing is bound by ``with ... as``.
        return None

    def __exit__(self, *args):
        # Resume the generator without forwarding exception details from the
        # ``with`` body; it must run to completion.
        try:
            self.generator.send(None)
        except StopIteration:
            return None
        raise RuntimeError("context manager generator did not exit")


def activate_first_context_manager(func):
    """
    Similar to contextlib.contextmanager, but the context is started when the
    decorated function is called rather than at the start of the ``with``
    statement. Useful for things where you want to optionally activate the
    context without a guard.
    """

    @wraps(func)
    def helper(*args, **kwargs):
        started = func(*args, **kwargs)
        return _ContextManager(started)

    return helper
28 | TODO: this will be replaced with something more performant 29 | """ 30 | global _fake_mode_worker, fake_mode 31 | 32 | # # Calls FakeTensorMode while re-enabling version counter tracking 33 | # # todo(chilli): I'm not totally sure why I need to disable python dispatch 34 | # # key. Perhaps there's some unwrapping that should have happened further up. 35 | # include_to_set = torch._C._dispatch_tls_local_include_set() 36 | # exclude_to_set = ( 37 | # torch._C._dispatch_tls_local_exclude_set() 38 | # | torch._C.DispatchKeySet(torch._C.DispatchKey.Python) 39 | # ) - torch._C.DispatchKeySet(torch._C.DispatchKey.ADInplaceOrView) 40 | 41 | # def work(): 42 | # with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set): 43 | # with fake_mode: 44 | # return fn(*args, **kwargs) 45 | 46 | # return work() 47 | 48 | def work(): 49 | # fake mode must be initialized in the worker thread 50 | # otherwise a monarch dispatch mode may be active, causing 51 | # FakeTensorMode to initialize wrong. 52 | with _fake_mode(): 53 | return fn(*args, **kwargs) 54 | 55 | return _fake_mode_worker().submit(work).result() 56 | -------------------------------------------------------------------------------- /python/monarch/common/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
// Method table of the `_C` extension module. Every entry is METH_NOARGS and is
// implemented by the function of the same name declared in mock_cuda.h.
static PyMethodDef _C_methods[] = {
    {"patch_cuda",
     patch_cuda,
     METH_NOARGS,
     "Initialize the monarch cuda patch."},
    {"mock_cuda", mock_cuda, METH_NOARGS, "Enable cuda mocking."},
    {"unmock_cuda", unmock_cuda, METH_NOARGS, "Disable cuda mocking."},
    // Sentinel entry marking the end of the table.
    {NULL, NULL, 0, NULL}};

// Module definition: name "_C", its docstring, an m_size of -1, the method
// table above, and no slots / traverse / clear / free hooks.
static struct PyModuleDef _C_module = {
    PyModuleDef_HEAD_INIT,
    "_C",
    "A module containing monarch C++ functionality.",
    -1,
    _C_methods,
    NULL,
    NULL,
    NULL,
    NULL};

// Entry point the interpreter calls on `import _C`; returns the new module
// object produced by PyModule_Create (NULL on failure, per the C API).
PyMODINIT_FUNC PyInit__C(void) {
  return PyModule_Create(&_C_module);
}
@contextmanager
def mock_cuda_guard() -> Generator[None, None, None]:
    """Context manager that turns CUDA mocking on for the body of the ``with`` block.

    While the block runs, the stream returned by ``get_mock_cuda_stream()`` is
    selected through ``torch.cuda.stream``.  Mocking is switched off again when
    the block is left, whether normally or through an exception.
    """
    try:
        # Select the shared mock stream first, then enable mocking inside it.
        with torch.cuda.stream(get_mock_cuda_stream()):
            monarch.common._C.mock_cuda()
            yield
    finally:
        # Runs after the stream context has been exited.  It also runs when
        # stream setup itself failed, i.e. before mock_cuda() was ever called.
        # NOTE(review): presumably unmock_cuda() is harmless in that case — confirm.
        monarch.common._C.unmock_cuda()
6 | 7 | # pyre-unsafe 8 | 9 | import logging 10 | 11 | import torch.distributed as dist 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _wrap_method(process_group: dist.ProcessGroup, method): 17 | def wrapper(*args, **kwargs): 18 | logger.debug( 19 | "ProcessGroup Call: %s with args %s and kwargs %s", method, args, kwargs 20 | ) 21 | fn = getattr(process_group, method) 22 | try: 23 | return fn(*args, **kwargs) 24 | except Exception as e: 25 | logger.warning( 26 | "ProcessGroup Call: %s with args %s and kwargs %s failed with exception: %s", 27 | method, 28 | args, 29 | kwargs, 30 | str(e), 31 | ) 32 | # TODO(rajeshn): send a message back to the controller that this 33 | # worker had a failed communication event 34 | raise e 35 | 36 | return wrapper 37 | 38 | 39 | class SingleControllerProcessGroupWrapper: 40 | """ 41 | Wraps a ProcessGroup object to provide a single controller process group. This provides us a hook to observe 42 | all the operatons on the process group to the controller. 43 | """ 44 | 45 | def __new__(cls, pg: dist.ProcessGroup): 46 | instance = super().__new__(cls) 47 | 48 | for attr in dir(type(pg)): 49 | if not attr.startswith("__") and callable(getattr(type(pg), attr)): 50 | setattr(instance, attr, _wrap_method(pg, attr)) 51 | 52 | return instance 53 | 54 | def __init__(self, process_group): 55 | self.process_group = process_group 56 | -------------------------------------------------------------------------------- /python/monarch/common/reference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
class Referenceable:
    """Base class for objects identified by an integer reference.

    ``ref`` starts as ``None`` and is assigned elsewhere.  When an instance
    holding a reference is finalized, ``delete_ref`` is called with it, so
    subclasses that hand out references must override ``delete_ref``.
    """

    def __init__(self):
        self.ref: Optional[int] = None

    def delete_ref(self, ref):
        """Release ``ref``; the base implementation always raises NotImplementedError."""
        raise NotImplementedError("no delete_ref method")

    def __reduce_ex__(self, protocol):
        # Pickling replaces the object with ``Ref(self.ref)``, so it can only be
        # sent once a reference has been assigned.
        assert (
            self.ref is not None
        ), f"{self} is being sent but does not have a reference"
        return Ref, (self.ref,)

    # Used by rust backend to get the ref for this object
    def __monarch_ref__(self) -> int:
        assert self.ref is not None
        return self.ref

    def __del__(self):
        # __del__ also runs for instances whose __init__ never completed (for
        # example when a subclass __init__ raised first), in which case ``ref``
        # does not exist yet; treat that the same as "no reference".
        ref = getattr(self, "ref", None)
        if ref is not None:
            self.delete_ref(ref)
15 | dtype: torch.dtype 16 | layout: torch.layout 17 | device: torch.device 18 | 19 | @staticmethod 20 | def from_tensor(t): 21 | return TensorFactory(t.size(), t.dtype, t.layout, t.device) 22 | 23 | def empty(self): 24 | return torch.empty( 25 | self.size, dtype=self.dtype, layout=self.layout, device=self.device 26 | ) 27 | 28 | def zeros(self): 29 | return torch.full( 30 | self.size, 0, dtype=self.dtype, layout=self.layout, device=self.device 31 | ) 32 | -------------------------------------------------------------------------------- /python/monarch/controller/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/controller/debugger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
def read(requested_size: int) -> bytes:
    """Read up to ``requested_size`` bytes of debugger input.

    Outside IPython the bytes come from a single ``readinto`` on the raw stdin
    stream.  Under IPython one line is obtained with ``input()`` and a trailing
    newline is appended.

    Args:
        requested_size: maximum number of bytes the caller can accept.

    Returns:
        The bytes read; empty when the raw stream delivered nothing.

    Raises:
        RuntimeError: under IPython, when the UTF-8 encoded line (including the
            appended newline) is longer than ``requested_size``.
    """
    if not is_ipython():
        buffer = bytearray(requested_size)
        # readinto() returns None when a non-blocking raw stream has no data.
        # Slicing with None would hand back the entire zero-filled buffer, so
        # treat that case as "0 bytes read".
        bytes_read = sys.stdin.buffer.raw.readinto(buffer) or 0
        return bytes(buffer[:bytes_read])

    # ipython doesn't have stdin directly connected
    # so we need to use input() instead.
    user_input = input() + "\n"
    input_bytes = user_input.encode("utf-8")
    if requested_size < len(input_bytes):
        raise RuntimeError(
            f"Debugger input line too long, max length is {requested_size}"
        )
    return input_bytes
def fetch_shard(
    obj: T, shard: dict[str, int] | None = None, **kwargs: int
) -> Future[T]:
    """
    Retrieve the shard at `shard` of the current device mesh of each
    tensor in obj. All tensors in `obj` will be fetched to the CPU device.
        obj - a pytree containing the tensors to fetch
        shard - a dictionary from mesh dimension name to coordinate of the shard
                If None, this will fetch from coordinate 0 for all dimensions (useful after all_reduce/all_gather)
        **kwargs - additional keyword arguments are added as entries to the shard dictionary;
                on a name clash the keyword argument wins
    Returns a Future that resolves to the fetched value. The `shard` dictionary
    passed by the caller is never modified.
    """
    if kwargs:
        # Merge into a fresh dict: updating `shard` in place would leak the
        # extra coordinates into the caller's dictionary.
        shard = {**(shard or {}), **kwargs}

    return cast("Future[T]", call_on_shard_and_fetch(remote_identity, obj, shard=shard))
# Type stub for the GradientGenerator exported by monarch.gradient.
# NOTE(review): all constructor arguments are typed Any, so their expected
# shapes cannot be told from this stub — check the implementation before use.
class GradientGenerator:
    def __init__(
        self,
        roots_list: Any,
        with_respect_to: Any,
        grad_roots: Any,
        context_restorer: Any,
    ): ...
    # Iterator protocol: __iter__ returns a GradientGenerator and each __next__
    # yields either a tensor or None.
    # pyre-ignore[11]: Annotation `torch.Tensor` is not defined as a type.
    def __next__(self) -> Optional[torch.Tensor]: ...
    def __iter__(self) -> "GradientGenerator": ...
@remote(propagate="inspect")
def _memory_controller_dump(ident, *args, **kwargs) -> None:
    """Write a CUDA memory snapshot into the directory given by ``kwargs[PATH_KEY]``.

    The directory is made absolute and created if missing.  The file is named
    ``snapshot_<rank>.pickle``, where ``<rank>`` is the torch.distributed rank,
    or 0 when torch.distributed is not initialized.

    NOTE(review): ``ident`` is accepted but not used in the file name, so
    repeated dumps into the same directory write to the same path — confirm
    whether that is intended.
    """
    snapshot_dir = Path(kwargs[PATH_KEY]).absolute()
    os.makedirs(snapshot_dir, exist_ok=True)
    # This is not a synchronized call, so it is okay to call without device mesh.
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    torch.cuda.memory._dump_snapshot(
        filename=f"{snapshot_dir}/snapshot_{rank}.pickle"
    )
6 | 7 | from monarch.parallel.pipelining.runtime import get_parameter_udf, PipelineParallelism 8 | 9 | __all__ = ["PipelineParallelism", "get_parameter_udf"] 10 | -------------------------------------------------------------------------------- /python/monarch/parallel/pipelining/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/proc_mesh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import warnings 8 | 9 | warnings.warn( 10 | "monarch.proc_mesh is deprecated, please import from monarch.actor instead.", 11 | DeprecationWarning, 12 | stacklevel=2, 13 | ) 14 | 15 | from monarch._src.actor.proc_mesh import * # noqa 16 | -------------------------------------------------------------------------------- /python/monarch/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
def new_state(seed: Tensor) -> State:
    """Return the RNG ``State`` obtained by seeding with ``seed``.

    The current state is captured first and put back afterwards, so the call
    leaves the active RNG state as it found it.

    Args:
        seed: tensor holding the seed; it is handed to ``_manual_seed``.

    Returns:
        The ``State`` (cpu, cuda) captured right after seeding.
    """
    orig = get_state()
    try:
        _manual_seed(seed)
        return get_state()
    finally:
        # Restore even if seeding or the capture fails; otherwise the caller
        # would be left running on the freshly seeded state.
        set_state(orig)
# Stack of metadata values; the innermost active ``set_meta`` is the last entry.
META_VAL = []


@contextlib.contextmanager
def set_meta(new_value):
    """Push ``new_value`` onto ``META_VAL`` for the duration of the ``with`` block.

    Sets the metadata for any tasks created under this context.  The value is
    popped again when the block exits, including when it exits by exception.
    """
    # The list is mutated in place, never rebound.
    META_VAL.append(new_value)
    try:
        yield
    finally:
        META_VAL.pop()
def Simulator(
    hosts: int,
    gpus: int,
    *,
    simulate_mode: Union["str", SimulatorBackendMode] = SimulatorBackendMode.SIMULATE,
    trace_mode: Union["str", SimulatorTraceMode] = SimulatorTraceMode.STREAM_ONLY,
    upload_trace: bool = False,
    trace_path: str = "trace.json",
    command_history_path: str = "command_history.pkl",
    group_workers: bool = False,
    build_ir: bool = False,
) -> "SimulatorInterface":
    """Create a simulated ``hosts`` x ``gpus`` device mesh and wrap it in a SimulatorInterface.

    Args:
        hosts: number of simulated hosts (first mesh dimension, named "host").
        gpus: GPUs per host (second mesh dimension, named "gpu").
        simulate_mode: a SimulatorBackendMode, or its name in any letter case.
        trace_mode: a SimulatorTraceMode, or its name in any letter case.
        upload_trace, trace_path, command_history_path, group_workers:
            forwarded unchanged to the simulator controller.
        build_ir: when true, an IRGraph is created and given to both the
            controller and the returned interface; otherwise ``None`` is used.

    Returns:
        A SimulatorInterface built from the device mesh, the controller and the
        optional IR graph.
    """
    # String modes are resolved by attribute lookup on the upper-cased name.
    if isinstance(simulate_mode, str):
        simulate_mode = getattr(SimulatorBackendMode, simulate_mode.upper())
    if isinstance(trace_mode, str):
        trace_mode = getattr(SimulatorTraceMode, trace_mode.upper())

    if build_ir:
        ir = IRGraph()
    else:
        ir = None

    world_size = hosts * gpus
    controller = _SimulatorController(
        world_size,
        gpu_per_host=gpus,
        simulate_mode=simulate_mode,
        trace_mode=trace_mode,
        upload_trace=upload_trace,
        trace_path=trace_path,
        command_history_path=command_history_path,
        group_workers=group_workers,
        ir=ir,
    )
    client = _Client(controller, controller.world_size, controller.gpu_per_host)

    # Row-major layout: one row of ``gpus`` consecutive ranks per host.
    layout = NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1])
    mesh = DeviceMesh(client, layout, ("host", "gpu"))

    # Exiting the mesh shuts the client down.
    mesh.exit = lambda: client.shutdown()
    return SimulatorInterface(mesh, controller, ir)
def compress_workers_range(workers) -> str:
    """Render worker ids as space-separated ``[start-end]`` ranges.

    The ids are sorted first, so the input may be in any order; consecutive ids
    are merged into one range and repeated ids are counted once.

    Args:
        workers: iterable of integer worker ids.

    Returns:
        For example ``"[0-2] [5-6] [9-9]"`` for ``[0, 1, 2, 5, 6, 9]``; an empty
        string when ``workers`` is empty.
    """
    ordered = sorted(workers)
    if not ordered:
        return ""

    regions = []
    start = end = ordered[0]
    # Walk the *sorted* ids; walking the caller's original order produces
    # wrong ranges whenever the input is not already sorted.
    for worker in ordered[1:]:
        if worker == end:
            # Duplicate id: already covered by the current range.
            continue
        if worker == end + 1:
            end = worker
        else:
            regions.append(f"[{start}-{end}]")
            start = end = worker
    regions.append(f"[{start}-{end}]")
    return " ".join(regions)
def main() -> None:
    """Time a 1000x1000 CUDA matmul two ways and print both mean timings.

    The same multiplication is measured with ``time.perf_counter`` on the host
    and with ``ExecutionTimer``, five iterations each.  Returns early with a
    message when CUDA is not available.
    """
    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available. Exiting.")
        return

    device = torch.device("cuda")

    num_iterations = 5

    a = torch.randn(1000, 1000, device=device)
    b = torch.randn(1000, 1000, device=device)

    # Warmup
    torch.matmul(a, b)
    torch.cuda.synchronize()

    # Host-side wall-clock timing.  There is no synchronize inside this loop.
    # NOTE(review): CUDA launches are normally asynchronous, so this presumably
    # measures launch overhead rather than kernel time — confirm that is the
    # comparison the example intends to show.
    cpu_timings = []
    for _ in range(num_iterations):
        t0 = time.perf_counter()
        torch.matmul(a, b)
        cpu_timings.append(time.perf_counter() - t0)

    # Same work, recorded by ExecutionTimer under the name "matrix_multiply".
    for _ in range(num_iterations):
        with ExecutionTimer.time("matrix_multiply"):
            torch.matmul(a, b)

    mean_cuda_ms = ExecutionTimer.summary()["matrix_multiply"]["mean_ms"]
    # perf_counter reports seconds; convert the mean to milliseconds.
    mean_perfcounter_ms = sum(cpu_timings) / len(cpu_timings) * 1000
    print("mean perf counter times: ", mean_perfcounter_ms)
    print("mean cuda times: ", mean_cuda_ms)
# Sentinel meaning "no scheduler was chosen".
NOT_SET: str = "__NOT_SET__"


@dataclass
class Config:
    """Settings bundle for ``monarch.tools`` (scheduler, its args, workspace, dry-run flag)."""

    # Scheduler name; defaults to the NOT_SET sentinel.
    scheduler: str = NOT_SET
    # Free-form scheduler arguments; a fresh dict is created per instance.
    scheduler_args: dict[str, Any] = field(default_factory=dict)
    # Optional workspace; None when not provided.
    workspace: Optional[str] = None
    # Dry-run flag, off by default.
    dryrun: bool = False
# --- python/monarch/tools/config/defaults.py ---


def component_fn(scheduler: str) -> Callable[..., specs.AppDef]:
    """The default TorchX component function for the scheduler.

    The ``scheduler`` argument is not consulted here: ``hyperactor.proc_mesh``
    is returned for every scheduler.
    """
    return hyperactor.proc_mesh


def scheduler_factories() -> dict[str, SchedulerFactory]:
    """Supported schedulers (name -> scheduler static factory method)"""
    return {  # pyre-ignore[7]
        # --- local schedulers (no multi-host support) ---
        "local_cwd": local_scheduler.create_scheduler,
        "local_docker": docker_scheduler.create_scheduler,
        # --- remote schedulers (yes multi-host support) ---
        "slurm": slurm_scheduler.create_scheduler,
        "k8s": kubernetes_scheduler.create_scheduler,
    }


def config(scheduler: str, workspace: Optional[str] = None) -> Config:
    """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``."""
    return Config(scheduler=scheduler, workspace=workspace)


def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]:
    """Used to attach a formatter to the dryrun info when running
    :py:func:`~monarch.tools.commands.create` in ``dryrun`` mode so that
    the returned ``AppDryRunInfo`` can be printed to console.
    """
    # no-op, use the default formatter already attached to the dryrun info
    return dryrun_info._fmt


# --- python/monarch/tools/utils.py ---


class conda:
    """Conda related util functions."""

    @staticmethod
    def active_env_dir() -> Optional[str]:
        """
        Returns the currently active conda environment's directory.
        `None` if run outside of a conda environment.
        """
        return os.getenv("CONDA_PREFIX")

    @staticmethod
    def active_env_name() -> Optional[str]:
        """
        Returns the currently active conda environment name.
        `None` if run outside of a conda environment.
        """
        env_name = os.getenv("CONDA_DEFAULT_ENV")

        if not env_name:
            # conda envs activated with metaconda don't set CONDA_DEFAULT_ENV, so
            # fall back to CONDA_PREFIX, which points to the path of the currently
            # active conda environment, e.g. /home/$USER/.conda/envs/{env_name}
            if env_dir := conda.active_env_dir():
                env_name = os.path.basename(env_dir)

        return env_name
class Lines:
    """
    Accumulates emitted source text line by line, remembering which context
    object was active when each line was emitted.
    """

    def __init__(self, context=None):
        self._lines: List[str] = []
        self._context: List[Any] = []
        self._current_context = context

    def get_context(self, lineno) -> Any:
        """Return the context recorded for 1-based line number ``lineno``."""
        index = lineno - 1
        return self._context[index]

    @contextmanager
    def context(self, obj: Any):
        """Make ``obj`` the active context for lines emitted inside the ``with`` body."""
        previous = self._current_context
        self._current_context = obj
        try:
            yield
        finally:
            # Restore the outer context even if the body raised.
            self._current_context = previous

    def emit(self, lines: str) -> None:
        """Append ``lines`` (split on newlines), tagging each with the active context."""
        self._lines.extend(lines.split("\n"))
        # Pad the context list up to the line count; a non-positive gap adds nothing.
        missing = len(self._lines) - len(self._context)
        self._context.extend([self._current_context] * missing)

    def emit_lines(self, lines: "Lines") -> None:
        """
        Append the contents of another ``Lines`` object, keeping the context
        recorded for each of its lines.
        """
        self._lines += lines._lines
        self._context += lines._context

    def text(self) -> str:
        """Return all emitted lines joined with newlines."""
        return "\n".join(self._lines)
class Monitor:
    """A monitor is a thread that watches for reported events to expire."""

    def __init__(self) -> None:
        # Daemon thread running the watch loop; it is not started until start().
        self.thread = threading.Thread(target=self._main, daemon=True, name="monitor")
        # Queue of (debug callback, ttl callable) pairs, one per reported event.
        self.events: queue.Queue[Tuple[Callable[[], None], Callable[[], float]]] = (
            queue.Queue()
        )
        # Seed the queue so _main's first get() returns immediately.
        # NOTE(review): TTL(None) presumably yields math.inf (no expiry) - confirm
        # against monarch_supervisor.TTL.
        self.events.put((lambda: None, TTL(None)))

    def start(self) -> None:
        """Start the monitor thread."""
        self.thread.start()

    def _main(self) -> None:
        """Thread body: wait for the next event; run the current event's debug
        callback if none arrives within its remaining time."""
        debug, ttl = self.events.get()
        while True:
            try:
                timeout = ttl()
                # An infinite ttl means wait without a timeout.
                next_debug, next_ttl = self.events.get(
                    timeout=None if timeout == math.inf else timeout
                )
            except queue.Empty:
                # The current event expired before a new one was reported.
                debug()
                # Then wait indefinitely for the next event.
                next_debug, next_ttl = self.events.get(timeout=None)

            debug, ttl = next_debug, next_ttl

    def __call__(
        self,
        debug_fn: Callable[[], None] = lambda: None,
        timeout: Optional[float] = None,
    ) -> None:
        """Start a new event with the provided timeout.
        If a timeout is specified, and a new event is not reported before it expires,
        the provided debug_fn is called."""
        self.events.put((debug_fn, TTL(timeout)))
def world_mesh(
    ctx: Context,
    hosts: List[Host],
    gpu_per_host: int,
    _processes=None,
) -> DeviceMesh:
    """Build a ``DeviceMesh`` with dimensions ``("host", "gpu")`` over ``hosts``.

    A ``ProcessBackend`` is created for the hosts, wrapped in a ``Controller``
    and a ``Client``, and the mesh is laid out row-major: one row per host,
    ``gpu_per_host`` entries per row.
    """
    backend = ProcessBackend(ctx, hosts, gpu_per_host, _processes=_processes)
    controller = Controller(backend)
    client = Client(controller, backend.world_size, backend.gpu_per_host)
    # Row-major layout: moving to the next host skips gpu_per_host ranks.
    layout = NDSlice(
        offset=0,
        sizes=[len(hosts), gpu_per_host],
        strides=[gpu_per_host, 1],
    )
    dim_names = ("host", "gpu")
    return DeviceMesh(client, layout, dim_names)
class Reply(NamedTuple):
    """Message sent back by ``reply_hello``; carries its three arguments."""

    a: int
    b: int
    x: int


def reply_hello(a, b, x):
    """Send ``Reply(a, b, x)`` on the process message queue."""
    get_message_queue().send(Reply(a, b, x))


def echo():
    """Echo received messages back until ``"exit"`` arrives.

    Each message must equal the count of messages echoed so far
    (0, 1, 2, ...); anything else fails the assertion.
    """
    queue = get_message_queue()
    expected = 0
    while True:
        _sender, message = queue.recv()
        if message == "exit":
            return
        assert message == expected
        queue.send(message)
        expected += 1


class Mapper:
    """Minimal map/reduce/finish object used by the supervisor tests."""

    def map(self, items):
        """Return the sum of every item doubled."""
        total = 0
        for item in items:
            total = total + item * 2
        return total

    def reduce(self, items):
        """Return the sum of ``items``."""
        return sum(items)

    def finish(self, result):
        """Return ``result`` unchanged."""
        return result
if __name__ == "__main__":
    # Entry point of a spawned process: receive one _FunctionCall from the
    # message queue, resolve its target to a callable, and invoke it.
    q = get_message_queue()
    _, call = q.recv()
    assert isinstance(call, _FunctionCall)
    # Target is either "dotted.module.func" or "path/to/file.py:func".
    filename, *rest = call.target.split(":", 1)
    if not rest:
        # Dotted form: the last component is the function name.
        modulename, funcname = filename.rsplit(".", 1)
        module = importlib.import_module(modulename)
    else:
        # File form: load the file as a module named "__entry__".
        spec = importlib.util.spec_from_file_location("__entry__", filename)
        assert spec is not None and spec.loader is not None
        module = importlib.util.module_from_spec(spec)
        # Register the module BEFORE executing it (as the importlib recipe for
        # loading a source file does), so code that looks itself up through
        # sys.modules while the file is still executing can find it.
        sys.modules["__entry__"] = module
        # pyre-ignore[16]
        spec.loader.exec_module(module)
        funcname = rest[0]
    func = getattr(module, funcname)
    func(*call.args, **call.kwargs)
# --- python/monarch_supervisor/log_pstree.py ---

pid: str
logger: logging.Logger = logging.getLogger(__name__)


def extract_pss(pid: str) -> Optional[str]:
    """Return the value and unit fields of the ``Pss:`` line in
    ``/proc/<pid>/smaps_rollup``, joined by a space, or ``None`` when the file
    cannot be read or has no such line."""
    try:
        with open(f"/proc/{pid}/smaps_rollup", "r") as f:
            for line in f.readlines():
                if line.startswith("Pss:"):  # Check if the line starts with 'Pss:'
                    return " ".join(line.split()[1:3])
    except Exception:
        # Best effort: any failure to read the file is reported as None.
        pass
    return None


def log_pstree_output(pid: int) -> None:
    """Log each non-blank line of ``pstree -Tap <pid>`` followed by the PSS of
    the process named on that line."""
    pstree_output = subprocess.check_output(["pstree", "-Tap", str(pid)]).decode(
        "utf-8"
    )
    lines = pstree_output.split("\n")
    logger.info("Process Info")
    for line in lines:
        if not line.strip():
            continue
        # The pid is taken as the first whitespace-delimited token after the
        # first comma on the line.
        # NOTE(review): parts[1] raises IndexError for a line without a comma -
        # confirm pstree never emits one with these flags.
        parts = line.split(",")
        pids = parts[1].split()[0]
        mem = extract_pss(pids)
        logger.info(f"{line} {mem}")


if __name__ == "__main__":
    # Exactly one command-line argument is expected: the pid to inspect.
    (pid,) = sys.argv[1:]
    initialize_logging(f"{gethostname()} host-manager")
    log_pstree_output(int(pid))


# --- python/monarch_supervisor/python_executable.py ---

# IN_PAR records whether the `__manifest__` module is importable.
try:
    from __manifest__ import fbmake  # noqa

    IN_PAR = True
except ImportError:
    IN_PAR = False

# Path of the Python executable to use, resolved once at import time.
PYTHON_EXECUTABLE: str
if IN_PAR:
    # The worker bootstrap binary will import this supervisor lib. When that
    # happens don't try to search for the bootstrap binary again, just use the
    # current executable.
    import __main__ as main_module  # @manual

    if hasattr(main_module, "__MONARCH_TENSOR_WORKER_ENV__"):
        PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"]
    else:
        try:
            with importlib.resources.path(
                "monarch_tensor_worker_env", "worker_env"
            ) as path:
                # A missing resource is reported the same way as a missing package.
                if not path.exists():
                    raise ImportError()
                PYTHON_EXECUTABLE = str(path)
        except ImportError:
            raise ImportError(
                "Monarch worker env not found, please define a custom 'monarch_tensor_worker_env' or "
                "add '//monarch/python/monarch_supervisor/worker:default_worker_env' "
                "to your binary dependencies in TARGETS"
            )
else:
    PYTHON_EXECUTABLE = sys.executable


# --- python/monarch_supervisor/worker/worker_env.py ---

# Marker attribute; python_executable.py checks for it on `__main__`.
__MONARCH_TENSOR_WORKER_ENV__ = True


def main() -> None:
    """Run ``sys.argv[2]`` as the main module, mimicking ``python -m <module> ...``."""
    assert sys.argv[1] == "-m"
    main_module = sys.argv[2]

    # Remove the -m and the main module from the command line arguments before
    # forwarding
    sys.argv[1:] = sys.argv[3:]
    # pyre-fixme[16]: Module `runpy` has no attribute `_run_module_as_main`.
    runpy._run_module_as_main(main_module, alter_argv=False)


if __name__ == "__main__":
    main()


# --- python/tests/_monarch/test_actor.py ---


def test_python_message() -> None:
    """
    Verifies that PythonMessage can be constructed reasonably fast.
    """
    method: str = "test_method"
    payload: str = "a" * 2**30  # 1gb
    blob: bytes = payload.encode("utf-8")
    # Only the constructor call is timed; building the payload is not.
    t = time.time()
    PythonMessage(PythonMessageKind.CallMethod(method, None), blob)
    t_spent = time.time() - t
    assert t_spent < 1
class TestClient(TestCase):
    """Checks ``client.WorkerResponse`` accessors for error, plain and tensor payloads."""

    def test_simple_with_error_response(self) -> None:
        """A response built from an ``Error`` is an exception and has no result."""
        actor_id = ActorId(world_name="test", rank=0, actor_name="actor")
        error = client.Error.new_for_unit_test(7, 8, actor_id, "test error")
        response = client.WorkerResponse.new_for_unit_test(seq=10, response=error)

        self.assertTrue(response.is_exception())
        raised = none_throws(response.exception())
        assert isinstance(raised, client.Error)

        self.assertEqual(raised.backtrace, "test error")
        self.assertEqual(response.result(), None)
        self.assertEqual(response.seq, 10)

    def test_simple_with_result_response(self) -> None:
        """A response built from a plain value exposes it via ``result()``."""
        expected = {"test": 1}
        response = client.WorkerResponse.new_for_unit_test(seq=11, response=expected)

        self.assertFalse(response.is_exception())
        self.assertEqual(response.exception(), None)
        self.assertEqual(response.result(), {"test": 1})
        self.assertEqual(response.seq, 11)

    def test_tensor(self) -> None:
        """A tensor inside the response payload comes back equal to the original."""
        tensor = torch.rand(3)
        response = client.WorkerResponse.new_for_unit_test(
            seq=11,
            response={"result": tensor},
        )
        returned = response.result()["result"]
        self.assertTrue(torch.equal(returned, tensor))
# --- python/tests/dispatch_bench_helper.py ---


def run_loop_local(n_iters, tensor_shape=(2, 2)):
    """Add a ones tensor to a zeros tensor ``n_iters`` times and return the result."""
    local = torch.zeros(*tensor_shape)
    ones = torch.ones(*tensor_shape)
    for _ in range(n_iters):
        local = ones + local
    return local


def _run_loop(*args, **kwargs):
    """Propagation function for ``run_loop``: a ones tensor sized by the second
    positional argument.

    NOTE(review): args[1] raises IndexError if the shape is not passed
    positionally - confirm how ``remote`` invokes ``propagate``.
    """
    return torch.ones(args[1])


# Remote handle for run_loop_local, created at import time.
run_loop = remote("tests.dispatch_bench_helper.run_loop_local", propagate=_run_loop)


# --- python/tests/sleep_binary.py ---


def main() -> None:
    """Announce start-up, then block in ``sleep_indefinitely_for_unit_tests``;
    exit with status 0 on ``KeyboardInterrupt``."""
    print("Starting sleep_binary. Process will sleep indefinitely until interrupted.")
    sys.stdout.flush()  # Ensure the message is printed before we sleep

    try:
        # This will sleep indefinitely until interrupted by a signal
        sleep_indefinitely_for_unit_tests()
    except KeyboardInterrupt:
        print("Received KeyboardInterrupt, exiting.")
        sys.exit(0)


if __name__ == "__main__":
    main()


# --- python/tests/test_alloc.py ---


class TestAlloc(IsolatedAsyncioTestCase):
    """Smoke test for ``ProcessAllocator``."""

    async def test_basic(self) -> None:
        """Allocating passes if it completes without raising; the result is only printed."""
        cmd = "echo hello"
        allocator = ProcessAllocator(cmd)
        spec = AllocSpec(AllocConstraints(), replica=2)
        alloc = await allocator.allocate(spec)

        print(alloc)
@contextmanager
def local_sim_mesh(
    hosts: int = 1,
    # TODO: support multiple gpus in a mesh.
    gpu_per_host: int = 1,
    activate: bool = True,
) -> Generator[DeviceMesh, None, None]:
    """Yield a single simulated ``DeviceMesh``, optionally activated.

    ``dm.exit()`` is called only when the ``with`` body finishes without
    raising. On an ``Exception`` the client is flagged as shut down and the
    exception is re-raised.
    """
    dms = sim_mesh(n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host)
    dm = dms[0]
    try:
        if activate:
            with dm.activate():
                yield dm
        else:
            yield dm
        dm.exit()
    except Exception:
        # NOTE(review): presumably marks the client as already shut down so no
        # further shutdown is attempted - confirm against Client.
        dm.client._shutdown = True
        raise


# oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
@pytest.mark.oss_skip
class TestSimBackend(TestCase):
    """Smoke test for the simulated mesh backend."""

    def test_local_mesh_setup(self):
        """Run a small tensor op on the simulated mesh and fetch a shard of the result."""
        with local_sim_mesh():
            t = torch.zeros(3, 4)
            t.add_(1)
            local_t = fetch_shard(t).result()
        # consider support specifying the return value in the mock worker.
        assert local_t is not None
class TestNetwork(unittest.TestCase):
    """Tests for ``network.get_sockaddr`` formatting, with ``socket.getaddrinfo`` mocked."""

    def test_network_ipv4_fallback(self) -> None:
        """The first lookup raises ``gaierror``; the second returns an IPv4
        entry, which is rendered as ``host:port``."""
        with mock.patch(
            "socket.getaddrinfo",
            # side_effect entries are consumed in order, one per call.
            side_effect=[
                socket.gaierror,
                [
                    (
                        socket.AF_INET,
                        socket.SOCK_STREAM,
                        socket.IPPROTO_TCP,
                        "",
                        ("123.45.67.89", 8080),
                    )
                ],
            ],
        ):
            self.assertEqual(
                "123.45.67.89:8080", network.get_sockaddr("foo.bar.facebook.com", 8080)
            )

    def test_network_ipv6(self) -> None:
        """An IPv6 entry is rendered as ``[addr]:port`` using the requested port."""
        with mock.patch(
            "socket.getaddrinfo",
            return_value=(
                [
                    (
                        socket.AF_INET6,
                        socket.SOCK_STREAM,
                        socket.IPPROTO_TCP,
                        "",
                        ("1234:ab00:567c:89d:abcd:0:328:0", 0, 0, 0),
                    )
                ]
            ),
        ):
            self.assertEqual(
                "[1234:ab00:567c:89d:abcd:0:328:0]:8080",
                network.get_sockaddr("foo.bar.facebook.com", 8080),
            )

    def test_network(self) -> None:
        """Unmocked lookup of this machine's FQDN returns something non-None."""
        # since we patched `socket.getaddrinfo` above
        # don't patch and just make sure things don't error out
        self.assertIsNotNone(network.get_sockaddr(socket.getfqdn(), 8080))
class TestCondaUtils(unittest.TestCase):
    """Tests for the ``conda`` helpers, run against a fully replaced ``os.environ``."""

    def test_conda_active_env_name(self) -> None:
        """Name comes from CONDA_DEFAULT_ENV, else the CONDA_PREFIX basename, else None."""
        name_only = {"CONDA_DEFAULT_ENV": "foo-py3"}
        with mock.patch.dict(os.environ, name_only, clear=True):
            self.assertEqual(conda.active_env_name(), "foo-py3")

        prefix_only = {"CONDA_PREFIX": "/home/USER/.conda/envs/bar-py3"}
        with mock.patch.dict(os.environ, prefix_only, clear=True):
            self.assertEqual(conda.active_env_name(), "bar-py3")

        nothing_set: dict = {}
        with mock.patch.dict(os.environ, nothing_set, clear=True):
            self.assertIsNone(conda.active_env_name())

    def test_conda_active_env_dir(self) -> None:
        """Directory mirrors CONDA_PREFIX and is None when that variable is unset."""
        prefix_only = {"CONDA_PREFIX": "/home/USER/.conda/envs/foo"}
        with mock.patch.dict(os.environ, prefix_only, clear=True):
            self.assertEqual(conda.active_env_dir(), "/home/USER/.conda/envs/foo")

        nothing_set: dict = {}
        with mock.patch.dict(os.environ, nothing_set, clear=True):
            self.assertIsNone(conda.active_env_dir())
@contextlib.contextmanager
def capture_stdout() -> Generator[io.StringIO, None, None]:
    """Redirect ``sys.stdout`` into an in-memory buffer for the ``with`` body.

    Yields:
        The ``io.StringIO`` receiving everything printed inside the block.
        The buffer is left open afterwards, so ``getvalue()`` works both inside
        and after the ``with`` statement; ``sys.stdout`` is restored on exit.
    """
    # Deliberately not used as a context manager: closing the buffer on exit
    # would make a later getvalue() raise ValueError.
    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        yield buf
7 | */ 8 | 9 | #include 10 | #include 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pyzmq 3 | requests 4 | numpy 5 | pyre-extensions 6 | cloudpickle 7 | torchx-nightly 8 | lark 9 | tabulate 10 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | # @rustc_version: rustc 1.87.0-nightly (920d95eaf 2025-03-28) 2 | [toolchain] 3 | channel = "nightly-2025-05-09" 4 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Get help on options with `rustfmt --help=config` 2 | # Please keep these in alphabetical order. 3 | edition = "2021" 4 | format_code_in_doc_comments = true 5 | group_imports = "StdExternalCrate" 6 | imports_granularity = "Item" 7 | merge_derives = false 8 | style_edition = "2024" 9 | use_field_init_shorthand = true 10 | -------------------------------------------------------------------------------- /timed_test/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/timed_test:[timed_test,timed_test_test] 2 | 3 | [package] 4 | name = "timed_test" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [[test]] 16 | name = "timed_test_test" 17 | path = "tests/basic.rs" 18 | 19 | [dependencies] 20 | quote = "1.0.29" 21 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 22 | 23 | [dev-dependencies] 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | 
-------------------------------------------------------------------------------- /timed_test/tests/basic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use timed_test::async_timed_test; 10 | 11 | #[async_timed_test(timeout_secs = 5)] 12 | async fn good() { 13 | #[allow(clippy::disallowed_methods)] 14 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 15 | } 16 | 17 | #[async_timed_test(timeout_secs = 1)] 18 | #[should_panic] 19 | async fn bad() { 20 | #[allow(clippy::disallowed_methods)] 21 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 22 | } 23 | 24 | #[async_timed_test(timeout_secs = 1)] 25 | #[should_panic] 26 | async fn very_bad() { 27 | loop { 28 | #[allow(clippy::disallowed_methods)] 29 | std::thread::sleep(std::time::Duration::from_secs(1)); 30 | } 31 | } 32 | 33 | #[async_timed_test(timeout_secs = 60)] 34 | #[should_panic] 35 | async fn panics_correctly() { 36 | panic!(); 37 | } 38 | -------------------------------------------------------------------------------- /tools/rust/ossconfigs/clippy.toml: -------------------------------------------------------------------------------- 1 | disallowed-methods = [ 2 | { path = "tokio::time::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 3 | { path = "std::thread::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 4 | { path = "tokio::time::Instant::now", reason = "use `hyperactor::clock::Clock::now` instead." }, 5 | { path = "std::time::SystemTime::now", reason = "use `hyperactor::clock::Clock::system_time_now` instead." }, 6 | { path = "tokio::time::timeout", reason = "use `hyperactor::clock::Clock::timeout` instead." 
}, 7 | ] 8 | -------------------------------------------------------------------------------- /torch-sys-cuda/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/torch-sys-cuda:torch-sys-cuda 2 | 3 | [package] 4 | name = "torch-sys-cuda" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | links = "torch_cuda" 10 | 11 | [dependencies] 12 | cxx = "1.0.119" 13 | derive_more = { version = "1.0.0", features = ["full"] } 14 | fxhash = "0.2.1" 15 | nccl-sys = { path = "../nccl-sys" } 16 | serde = { version = "1.0.185", features = ["derive", "rc"] } 17 | thiserror = "2.0.12" 18 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 19 | 20 | [build-dependencies] 21 | cxx-build = "1.0.119" 22 | pyo3-build-config = "0.24.2" 23 | 24 | [features] 25 | cuda = [] 26 | default = ["cuda"] 27 | -------------------------------------------------------------------------------- /torch-sys-cuda/src/bridge.cpp: -------------------------------------------------------------------------------- 1 | #include "monarch/torch-sys-cuda/src/bridge.h" 2 | 3 | namespace monarch { 4 | std::unique_ptr 5 | create_cuda_event(bool enable_timing, bool blocking, bool interprocess) { 6 | unsigned int flags = (blocking ? cudaEventBlockingSync : cudaEventDefault) | 7 | (enable_timing ? cudaEventDefault : cudaEventDisableTiming) | 8 | (interprocess ? 
cudaEventInterprocess : cudaEventDefault); 9 | 10 | return std::make_unique(flags); 11 | } 12 | 13 | std::shared_ptr get_current_stream( 14 | c10::DeviceIndex device) { 15 | return std::make_shared( 16 | c10::cuda::getCurrentCUDAStream(device)); 17 | } 18 | 19 | std::shared_ptr create_stream( 20 | c10::DeviceIndex device, 21 | int32_t priority) { 22 | return std::make_shared( 23 | c10::cuda::getStreamFromPool((const int)priority, device)); 24 | } 25 | 26 | void set_current_stream(const c10::cuda::CUDAStream& stream) { 27 | auto device = c10::cuda::current_device(); 28 | if (device != stream.device_index()) { 29 | c10::cuda::set_device(stream.device_index()); 30 | } 31 | at::cuda::setCurrentCUDAStream(stream); 32 | } 33 | } // namespace monarch 34 | -------------------------------------------------------------------------------- /torch-sys-cuda/src/bridge.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include // @manual=//caffe2:torch-cpp 12 | #include // @manual 13 | #include // @manual=//caffe2:torch-cpp 14 | 15 | namespace monarch { 16 | 17 | std::unique_ptr 18 | create_cuda_event(bool enable_timing, bool blocking, bool interprocess); 19 | 20 | std::shared_ptr get_current_stream( 21 | c10::DeviceIndex device); 22 | 23 | std::shared_ptr create_stream( 24 | c10::DeviceIndex device, 25 | int32_t priority); 26 | 27 | void set_current_stream(const c10::cuda::CUDAStream& stream); 28 | 29 | /// This function exists because ncclConfig initialization requires the use of 30 | /// a macro. We cannot reference the macro directly from Rust code, so we wrap 31 | /// the macro use in a function and bind that to Rust instead. 
32 | inline ncclConfig_t make_nccl_config() { 33 | ncclConfig_t ret = NCCL_CONFIG_INITIALIZER; 34 | return ret; 35 | } 36 | 37 | } // namespace monarch 38 | -------------------------------------------------------------------------------- /torch-sys-cuda/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A companion to the `torch-sys` crate that provides bindings for 10 | //! CUDA-specific functionality from libtorch. This crate is separated out to 11 | //! make it easier for clients who want to avoid compiling CUDA code. 12 | //! 13 | //! The same safety logic described in the `torch-sys` crate applies here. 14 | mod bridge; 15 | pub mod cuda; 16 | pub mod nccl; 17 | -------------------------------------------------------------------------------- /torch-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/torch-sys:torch-sys 2 | 3 | [package] 4 | name = "torch-sys" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | links = "torch" 10 | 11 | [dependencies] 12 | anyhow = "1.0.98" 13 | async-trait = "0.1.86" 14 | atomic_refcell = "0.1.13" 15 | bincode = "1.3.3" 16 | cxx = "1.0.119" 17 | derive_more = { version = "1.0.0", features = ["full"] } 18 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 19 | nccl-sys = { path = "../nccl-sys", optional = true } 20 | paste = "1.0.14" 21 | pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods"] } 22 | regex = "1.11.1" 23 | serde = { version = "1.0.185", features = ["derive", "rc"] } 24 | thiserror = "2.0.12" 25 | tokio = { version = "1.45.0", features = ["full", 
"test-util", "tracing"] } 26 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 27 | 28 | [build-dependencies] 29 | bindgen = "0.70.1" 30 | cxx-build = "1.0.119" 31 | pyo3-build-config = "0.24.2" 32 | 33 | [features] 34 | cuda = ["dep:nccl-sys"] 35 | -------------------------------------------------------------------------------- /torch-sys/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | See the source documentation or `bunnylol rustdoc torch-sys` to see docs. 4 | 5 | # Cargo build 6 | 7 | The cargo build requires that you have a version of PyTorch installed in your 8 | Python environment. To get set up, run the following on your devgpu: 9 | 10 | ```sh 11 | # get conda on devserver 12 | sudo feature install genai_conda 13 | 14 | # Set up conda env 15 | conda create -n monarch 16 | conda activate monarch 17 | 18 | # install pytorch 19 | conda install pytorch pytorch-cuda=12.4 -c pytorch -c nvidia 20 | 21 | # install cuda toolkit on devserver (requires devgpu) 22 | sudo dnf install cuda-12-0 23 | 24 | # install nccl on devserver (requires devgpu) 25 | sudo dnf install libnccl-devel 26 | 27 | # install libclang on devserver (needed for rust-bindgen) 28 | sudo dnf install clang-devel 29 | 30 | # in monarch/torch-sys 31 | cargo test 32 | ``` 33 | -------------------------------------------------------------------------------- /torch-sys/src/bindings.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 10 | -------------------------------------------------------------------------------- /torch-sys/src/pyobject.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use cxx::type_id; 10 | use pyo3::prelude::*; 11 | 12 | #[repr(transparent)] 13 | pub(crate) struct FFIPyObject(*mut pyo3::ffi::PyObject); 14 | 15 | // SAFETY: This is just a pointer to a PyObject and the pointer is 16 | // never dereferenced directly. It can only be converted to pyo3::PyObject 17 | // and then dereferenced through that. PyO3 manages the access patterns to 18 | // the underlying PyObject. 19 | // Additionally, we make the assumption that ownership of the underlying 20 | // PyObject is transferred along with it. 21 | // Hence FFIPyObject should always be created from an owned pointer. 22 | unsafe impl cxx::ExternType for FFIPyObject { 23 | type Id = type_id!("monarch::FFIPyObject"); 24 | type Kind = cxx::kind::Trivial; 25 | } 26 | 27 | impl From> for FFIPyObject { 28 | #[inline] 29 | fn from(obj: Py) -> Self { 30 | Self(obj.into_ptr()) 31 | } 32 | } 33 | 34 | impl From> for FFIPyObject { 35 | #[inline] 36 | fn from(obj: Bound<'_, T>) -> Self { 37 | Self(obj.into_ptr()) 38 | } 39 | } 40 | 41 | impl From<&Bound<'_, T>> for FFIPyObject { 42 | #[inline] 43 | fn from(obj: &Bound<'_, T>) -> Self { 44 | Self(obj.clone().into_ptr()) 45 | } 46 | } 47 | 48 | impl<'py> IntoPyObject<'py> for FFIPyObject { 49 | type Target = PyAny; 50 | type Output = Bound<'py, Self::Target>; 51 | type Error = PyErr; 52 | 53 | fn into_pyobject(self, py: Python<'py>) -> Result { 54 | // SAFETY: Pull in the `PyObject` from C/C++. 
55 | Ok(unsafe { PyObject::from_owned_ptr(py, self.0) }.into_bound(py)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /torch-sys/src/torch.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | // Going for a smaller set of headers until more enums are needed 12 | #include 13 | #include 14 | #include 15 | --------------------------------------------------------------------------------