├── .dockerignore ├── .flake8 ├── .github └── workflows │ ├── test.yml │ └── wheels.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── README.md ├── build-requirements.txt ├── clippy.toml ├── controller ├── Cargo.toml ├── build.rs └── src │ ├── bootstrap.rs │ ├── history.rs │ ├── lib.rs │ └── main.rs ├── examples ├── __init__.py ├── grpo_actor.py └── notebooks │ ├── README.md │ ├── ping_pong.ipynb │ └── spmd_ddp.ipynb ├── hyper ├── Cargo.toml ├── hyper.dotslash.py ├── src │ ├── commands.rs │ ├── commands │ │ ├── demo.rs │ │ ├── procs.rs │ │ ├── serve.rs │ │ ├── show.rs │ │ └── top.rs │ ├── lib.rs │ ├── main.rs │ ├── tui │ │ ├── mod.rs │ │ └── top.rs │ └── utils │ │ ├── mod.rs │ │ └── system_address.rs └── tests │ └── demo_test.py ├── hyperactor ├── Cargo.toml ├── example │ └── derive.rs └── src │ ├── accum.rs │ ├── actor │ ├── mod.rs │ └── remote.rs │ ├── cap.rs │ ├── channel │ ├── local.rs │ ├── mod.rs │ ├── net.rs │ └── sim.rs │ ├── checkpoint.rs │ ├── clock.rs │ ├── config.rs │ ├── data.rs │ ├── init.rs │ ├── lib.rs │ ├── mailbox │ ├── durable_mailbox_sender.rs │ ├── mailbox_admin_message.rs │ └── mod.rs │ ├── message.rs │ ├── metrics.rs │ ├── panic_handler.rs │ ├── parse.rs │ ├── proc.rs │ ├── reference.rs │ ├── simnet.rs │ ├── spawn.rs │ ├── supervision.rs │ ├── sync │ ├── flag.rs │ ├── mod.rs │ └── monitor.rs │ └── test_utils │ ├── mod.rs │ ├── pingpong.rs │ ├── proc_supervison.rs │ ├── process_assertion.rs │ └── tracing.rs ├── hyperactor_extension ├── Cargo.toml └── src │ ├── alloc.rs │ ├── lib.rs │ └── telemetry.rs ├── hyperactor_macros ├── Cargo.toml ├── build.rs ├── src │ └── lib.rs └── tests │ └── basic.rs ├── hyperactor_mesh ├── Cargo.toml ├── examples │ └── dining_philosophers.rs ├── src │ ├── actor_mesh.rs │ ├── alloc │ │ ├── local.rs │ │ ├── logtailer.rs │ │ ├── mod.rs │ │ ├── process.rs │ │ └── remoteprocess.rs │ ├── assign.rs │ ├── bootstrap.rs │ ├── comm │ │ ├── mod.rs │ │ └── multicast.rs │ ├── lib.rs │ ├── mesh.rs │ ├── mesh_selection.rs │ ├── metrics.rs │ ├── proc_mesh │ │ ├── mesh_agent.rs │ │ └── mod.rs │ ├── reference.rs │ ├── shortuuid.rs │ └── test_utils.rs └── test │ └── bootstrap.rs ├── hyperactor_mesh_macros ├── Cargo.toml └── src │ └── lib.rs ├── hyperactor_multiprocess ├── Cargo.toml └── src │ ├── lib.rs │ ├── ping_pong.rs │ ├── proc_actor.rs │ ├── pyspy.rs │ ├── scheduler.rs │ ├── supervision.rs │ ├── system.rs │ └── system_actor.rs ├── hyperactor_telemetry ├── Cargo.toml ├── src │ ├── lib.rs │ ├── otel.rs │ ├── pool.rs │ ├── recorder.rs │ └── spool.rs ├── stubs │ ├── fbinit │ │ └── src │ │ │ └── lib.rs │ └── scuba │ │ └── src │ │ └── lib.rs └── tester │ ├── Cargo.toml │ └── main.rs ├── monarch_extension ├── Cargo.toml ├── build.rs └── src │ ├── client.rs │ ├── controller.rs │ ├── convert.rs │ ├── debugger.rs │ ├── lib.rs │ ├── panic.rs │ ├── simulator_client.rs │ └── tensor_worker.rs ├── monarch_hyperactor ├── Cargo.toml └── src │ ├── actor.rs │ ├── actor_mesh.rs │ ├── alloc.rs │ ├── bin │ └── process_allocator │ │ ├── common.rs │ │ └── main.rs │ ├── bootstrap.rs │ ├── lib.rs │ ├── mailbox.rs │ ├── ndslice.rs │ ├── proc.rs │ ├── proc_mesh.rs │ ├── runtime.rs │ ├── selection.rs │ └── shape.rs ├── monarch_messages ├── Cargo.toml ├── build.rs ├── src │ ├── client.rs │ ├── controller.rs │ ├── debugger.rs │ ├── lib.rs │ ├── wire_value.rs │ └── worker.rs └── test_utils.py ├── monarch_meta_extension ├── Cargo.toml └── src │ ├── alloc.rs │ ├── alloc_mock.rs │ └── lib.rs ├── monarch_rdma ├── Cargo.toml ├── 
examples │ ├── Cargo.toml │ ├── bootstrap.rs │ ├── main.rs │ └── parameter_server.rs └── src │ ├── ibverbs_primitives.rs │ ├── lib.rs │ ├── rdma_buffer.rs │ ├── rdma_components.rs │ └── rdma_manager_actor.rs ├── monarch_simulator ├── Cargo.toml └── src │ ├── bootstrap.rs │ ├── collective_coordinator.rs │ ├── controller.rs │ ├── lib.rs │ ├── main.rs │ ├── simulator.rs │ └── worker.rs ├── monarch_tensor_worker ├── Cargo.toml ├── build.rs ├── src │ ├── bootstrap.rs │ ├── borrow.rs │ ├── comm.rs │ ├── device_mesh.rs │ ├── lib.rs │ ├── pipe.rs │ ├── py_pipe.rs │ ├── stream.rs │ └── test_util.rs ├── test_utils.py └── test_worker_main.py ├── monarch_types ├── Cargo.toml └── src │ ├── lib.rs │ ├── pyobject.rs │ ├── python.rs │ └── pytree.rs ├── nccl-sys ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── nccl.h ├── ndslice ├── Cargo.toml └── src │ ├── lib.rs │ ├── reshape.rs │ ├── selection │ ├── mod.rs │ ├── normal.rs │ ├── parse.rs │ ├── pretty.rs │ ├── routing.rs │ ├── test_utils.rs │ └── token_parser.rs │ ├── shape.rs │ ├── slice.rs │ └── strategy.rs ├── pyproject.toml ├── python ├── monarch │ ├── __init__.py │ ├── _monarch │ │ ├── __init__.py │ │ ├── hyperactor │ │ │ └── __init__.py │ │ ├── selection │ │ │ └── __init__.py │ │ └── worker │ │ │ ├── __init__.py │ │ │ ├── debugger.py │ │ │ └── logging.py │ ├── _rust_bindings │ │ ├── __init__.pyi │ │ ├── controller │ │ │ └── bootstrap.pyi │ │ ├── hyperactor_extension │ │ │ ├── alloc.pyi │ │ │ └── telemetry.pyi │ │ ├── monarch_extension │ │ │ ├── __init__.pyi │ │ │ ├── client.pyi │ │ │ ├── controller.pyi │ │ │ ├── debugger.pyi │ │ │ ├── panic.pyi │ │ │ ├── simulator_client.pyi │ │ │ └── tensor_worker.pyi │ │ ├── monarch_hyperactor │ │ │ ├── actor.pyi │ │ │ ├── actor_mesh.pyi │ │ │ ├── alloc.pyi │ │ │ ├── bootstrap.pyi │ │ │ ├── mailbox.pyi │ │ │ ├── proc.pyi │ │ │ ├── proc_mesh.pyi │ │ │ ├── runtime.pyi │ │ │ ├── selection.pyi │ │ │ └── shape.pyi │ │ ├── monarch_messages │ │ │ └── debugger.pyi │ │ ├── monarch_tensor_worker │ │ │ └── bootstrap.pyi │ │ └── old.pyi │ ├── _testing.py │ ├── actor_mesh.py │ ├── allocator.py │ ├── bootstrap_main.py │ ├── builtins │ │ ├── __init__.py │ │ ├── log.py │ │ └── random.py │ ├── cached_remote_function.py │ ├── common │ │ ├── _C.pyi │ │ ├── __init__.py │ │ ├── _coalescing.py │ │ ├── _device_utils.py │ │ ├── _tensor_to_table.py │ │ ├── base_tensor.py │ │ ├── borrows.py │ │ ├── client.py │ │ ├── constants.py │ │ ├── context_manager.py │ │ ├── controller_api.py │ │ ├── device_mesh.py │ │ ├── fake.py │ │ ├── function.py │ │ ├── function_caching.py │ │ ├── future.py │ │ ├── init.cpp │ │ ├── invocation.py │ │ ├── mast.py │ │ ├── messages.py │ │ ├── mock_cuda.cpp │ │ ├── mock_cuda.h │ │ ├── mock_cuda.py │ │ ├── opaque_ref.py │ │ ├── pickle_flatten.py │ │ ├── pipe.py │ │ ├── process_group.py │ │ ├── recording.py │ │ ├── reference.py │ │ ├── remote.py │ │ ├── selection.py │ │ ├── shape.py │ │ ├── stream.py │ │ ├── tensor.py │ │ ├── tensor_factory.py │ │ └── tree.py │ ├── controller │ │ ├── __init__.py │ │ ├── backend.py │ │ ├── controller.py │ │ ├── debugger.py │ │ ├── history.py │ │ └── rust_backend │ │ │ ├── __init__.py │ │ │ └── controller.py │ ├── fetch.py │ ├── future.py │ ├── gradient │ │ ├── __init__.py │ │ ├── _gradient_generator.cpp │ │ └── _gradient_generator.pyi │ ├── gradient_generator.py │ ├── memory.py │ ├── notebook.py │ ├── opaque_module.py │ ├── opaque_object.py │ ├── parallel │ │ ├── __init__.py │ │ └── pipelining │ │ │ ├── __init__.py │ │ │ ├── runtime.py │ │ │ ├── schedule_ir.py │ │ │ └── 
scheduler.py │ ├── proc_mesh.py │ ├── profiler.py │ ├── python_local_mesh.py │ ├── random.py │ ├── rdma.py │ ├── remote_class.py │ ├── rust_backend_mesh.py │ ├── rust_local_mesh.py │ ├── sim_mesh.py │ ├── simulator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── command_history.py │ │ ├── config.py │ │ ├── interface.py │ │ ├── ir.py │ │ ├── mock_controller.py │ │ ├── profiling.py │ │ ├── simulator.py │ │ ├── task.py │ │ ├── tensor.py │ │ ├── trace.py │ │ ├── utils.py │ │ └── worker.py │ ├── tensor_worker_main.py │ ├── tensorboard.py │ ├── timer │ │ ├── README.md │ │ ├── __init__.py │ │ ├── example_monarch.py │ │ ├── example_spmd.py │ │ ├── execution_timer.py │ │ └── execution_timer_test.py │ ├── tools │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── commands.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ └── hyperactor.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── defaults.py │ │ └── mesh_spec.py │ ├── worker │ │ ├── __init__.py │ │ ├── _testing_function.py │ │ ├── compiled_block.py │ │ ├── debugger.py │ │ ├── lines.py │ │ ├── monitor.py │ │ └── worker.py │ └── world_mesh.py ├── monarch_supervisor │ ├── README.md │ ├── __init__.py │ ├── _testing.py │ ├── diagram.png │ ├── function_call.py │ ├── host.py │ ├── launchers.py │ ├── log_pstree.py │ ├── logging.py │ ├── python_executable.py │ └── worker │ │ └── worker_env.py └── tests │ ├── __init__.py │ ├── _monarch │ ├── test_client.py │ ├── test_controller.py │ ├── test_hyperactor.py │ ├── test_ndslice.py │ └── test_worker.py │ ├── builtins │ ├── test_log.py │ └── test_random.py │ ├── dispatch_bench.py │ ├── dispatch_bench_helper.py │ ├── error_test_binary.py │ ├── requirements.txt │ ├── simulator │ ├── __init__.py │ ├── test_profiling.py │ ├── test_simulator.py │ ├── test_task.py │ └── test_worker.py │ ├── sleep_binary.py │ ├── test_actor_error.py │ ├── test_alloc.py │ ├── test_coalescing.py │ ├── test_controller.py │ ├── test_device_mesh.py │ ├── test_fault_tolerance.py │ ├── test_future.py │ ├── test_grad_generator.py │ ├── test_mock_cuda.py │ ├── test_pdb_actor.py │ ├── test_python_actors.py │ ├── test_remote_functions.py │ ├── test_rust_backend.py │ ├── test_signal_safe_block_on.py │ ├── test_sim_backend.py │ └── tools │ ├── config │ └── test_defaults.py │ ├── test_cli.py │ ├── test_commands.py │ ├── test_mesh_spec.py │ └── utils.py ├── requirements.txt ├── rust-toolchain ├── rustfmt.toml ├── setup.py ├── timed_test ├── Cargo.toml ├── src │ └── lib.rs └── tests │ └── basic.rs ├── tools └── rust │ └── ossconfigs │ └── clippy.toml └── torch-sys ├── Cargo.toml ├── README.md ├── build.rs ├── src ├── backend.rs ├── bindings.rs ├── borrow.rs ├── bridge.cpp ├── bridge.h ├── bridge.rs ├── call_op.rs ├── cell.rs ├── cuda.rs ├── device.rs ├── ivalue.rs ├── layout.rs ├── lib.rs ├── memory_format.rs ├── nccl.rs ├── pyobject.rs ├── rvalue.rs ├── scalar_type.rs ├── tensor.rs └── torch.hpp └── test_utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | .pyre 4 | docs 5 | *_meta/** 6 | **/*_meta/** 7 | **/*_meta.rs 8 | **/meta/** 9 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 256 3 | extend-ignore = E302, G004, SIM105, G201, SIM115, SIM904 4 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: 
-------------------------------------------------------------------------------- 1 | name: Build monarch 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | - gh/** 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | test: 18 | name: cuda12.6-py3.10-4xlarge 19 | strategy: 20 | fail-fast: true 21 | matrix: 22 | include: 23 | - name: 4xlarge 24 | runs-on: linux.g5.4xlarge.nvidia.gpu 25 | torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126' 26 | gpu-arch-type: "cuda" 27 | gpu-arch-version: "12.6" 28 | uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main 29 | with: 30 | timeout: 60 31 | runner: ${{ matrix.runs-on }} 32 | gpu-arch-type: ${{ matrix.gpu-arch-type }} 33 | gpu-arch-version: ${{ matrix.gpu-arch-version }} 34 | submodules: recursive 35 | script: | 36 | conda create -n venv python=3.10 -y 37 | conda activate venv 38 | export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH 39 | python -m pip install --upgrade pip 40 | 41 | # Install native dependencies 42 | dnf update -y 43 | dnf install clang-devel libunwind libunwind-devel -y 44 | 45 | # Install rust and setup nightly toolchain 46 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 47 | source $HOME/.cargo/env 48 | rustup toolchain install nightly 49 | rustup default nightly 50 | 51 | # Install build dependencies 52 | pip install -r build-requirements.txt 53 | 54 | # Install test dependencies 55 | pip install -r python/tests/requirements.txt 56 | 57 | # Build and install monarch 58 | # NB: monarch currently can't be built in isolated builds (e.g. it is not PEP 517 compatible) 59 | # because 'torch-sys' needs to be compiled against 'torch' in the main python environment 60 | # so that libtorch is linked correctly at runtime. 61 | pip install --no-build-isolation . 62 | 63 | # Run tests 64 | LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" 65 | python python/tests/test_mock_cuda.py 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | python/**/*.so 4 | python/**/*.json 5 | python/**/*.html 6 | python/**/*.pkl 7 | python/**/__pycache__ 8 | python/monarch.egg-info/* 9 | *.egg 10 | build/* 11 | dist/* 12 | monarch.egg-info/* 13 | python/monarch/monarch_controller 14 | 15 | .ipynb_checkpoints 16 | 17 | # Rust stuff 18 | target/ 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Meta Open Source Projects 2 | 3 | We want to make contributing to this project as easy and transparent as 4 | possible. 5 | 6 | ## Pull Requests 7 | We actively welcome your pull requests. 8 | 9 | Note: pull requests are not merged into the GitHub repository in the usual way. There is an internal Meta repository that is the "source of truth" for the project. The GitHub repository is generated *from* the internal Meta repository. So we don't merge GitHub PRs directly to the GitHub repository -- they must first be imported into the internal Meta repository. When Meta employees look at the GitHub PR, there is a special button visible only to them that executes that import.
The changes are then automatically reflected from the internal Meta repository back to GitHub. This is why you won't see your PR being merged directly, but you will still see your changes in the repository once it reflects the imported changes. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Meta's open source projects. 21 | 22 | Complete your CLA here: <https://code.facebook.com/cla> 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## License 33 | By contributing to this project, you agree that your contributions will be licensed 34 | under the LICENSE file in the root directory of this source tree. 35 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "controller", 5 | "hyper", 6 | "hyperactor", 7 | "hyperactor_macros", 8 | "hyperactor_multiprocess", 9 | "hyperactor_mesh", 10 | "hyperactor_mesh_macros", 11 | "ndslice", 12 | "monarch_extension", 13 | "monarch_tensor_worker", 14 | "nccl-sys", 15 | "torch-sys", 16 | ] 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Pre-reqs: 2 | # 1. podman (shown below) or just docker 3 | # $ dnf install -y podman podman-docker 4 | # 2. NVIDIA container toolkit 5 | # $ dnf install -y nvidia-container-toolkit 6 | # 7 | # Build: 8 | # $ cd ~/monarch 9 | # $ export TAG_NAME=$USER-dev 10 | # $ docker build --network=host \ 11 | # -t monarch:$TAG_NAME \ 12 | # -f Dockerfile . 13 | # 14 | # Build (with http proxy): 15 | # $ docker build --network=host \ 16 | # --build-arg=http_proxy=$http_proxy \ 17 | # --build-arg=https_proxy=$https_proxy \ 18 | # -t monarch:$TAG_NAME \ 19 | # -f Dockerfile . 20 | # 21 | ARG http_proxy 22 | ARG https_proxy 23 | 24 | FROM pytorch/pytorch:2.7.0-cuda12.6-cudnn9-devel 25 | WORKDIR /monarch 26 | 27 | # export http proxy env vars if build-args are provided 28 | RUN if [ -n "${http_proxy}" ]; then export http_proxy=${http_proxy}; fi && \ 29 | if [ -n "${https_proxy}" ]; then export https_proxy=${https_proxy}; fi 30 | 31 | # Install native dependencies 32 | RUN apt-get update -y && \ 33 | apt-get -y install curl clang liblzma-dev libunwind-dev 34 | 35 | # Install Rust 36 | ENV PATH="/root/.cargo/bin:${PATH}" 37 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 38 | 39 | # Install Python deps as a separate layer to avoid rebuilding if deps do not change 40 | COPY requirements.txt . 41 | RUN pip install --no-cache-dir -r requirements.txt 42 | 43 | # Install monarch 44 | COPY . .
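# Run (a sketch; assumes the NVIDIA container toolkit is configured on the host
# and that the image was tagged monarch:$TAG_NAME as in the build commands above):
# $ docker run --rm -it --gpus all monarch:$TAG_NAME bash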
45 | RUN cargo install --path monarch_hyperactor 46 | RUN pip install . 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) Meta Platforms, Inc. and affiliates. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Monarch 🦋 2 | 3 | **Monarch** is a distributed execution engine for PyTorch. Our overall goal is 4 | to deliver the high-quality user experience that people get from single-GPU 5 | PyTorch, but at cluster scale. 6 | 7 | > ⚠️ **Early Development Warning** Monarch is currently in an experimental 8 | > stage. You should expect bugs, incomplete features, and APIs that may change 9 | > in future versions. The project welcomes bugfixes, but to make sure things are 10 | > well coordinated you should discuss any significant change before starting the 11 | > work. It's recommended that you signal your intention to contribute in the 12 | > issue tracker, either by filing a new issue or by claiming an existing one. 
13 | 14 | Note: Monarch is currently only supported on Linux systems. 15 | 16 | ## Installation 17 | 18 | `pip install torchmonarch` 19 | 20 | or manually 21 | 22 | ```sh 23 | 24 | # Create and activate the conda environment 25 | conda create -n monarchenv python=3.10 -y 26 | conda activate monarchenv 27 | 28 | # Install nightly rust toolchain 29 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 30 | rustup toolchain install nightly 31 | rustup default nightly 32 | 33 | # Install non-python dependencies 34 | conda install libunwind -y 35 | 36 | # Install the correct cuda and cuda-toolkit versions for your machine 37 | sudo dnf install cuda-toolkit-12-0 cuda-12-0 38 | 39 | # Install clang-dev and nccl-dev 40 | sudo dnf install clang-devel libnccl-devel 41 | # Or, in some environments, the following may be necessary instead 42 | conda install -c conda-forge clangdev nccl 43 | conda update -n monarchenv --all -c conda-forge -y 44 | 45 | # Install build dependencies 46 | pip install -r build-requirements.txt 47 | # Install test dependencies 48 | pip install -r python/tests/requirements.txt 49 | 50 | # Build and install Monarch 51 | pip install --no-build-isolation . 52 | # or set up for development 53 | pip install --no-build-isolation -e . 54 | 55 | # Run unit tests. Consider -s for more verbose output 56 | pytest python/tests/ -v -m "not oss_skip" 57 | ``` 58 | 59 | ## Running examples 60 | 61 | Check out the `examples/` directory for demonstrations of how to use Monarch's APIs. 62 | 63 | We'll be adding more examples as we stabilize and polish functionality! 64 | 65 | ## License 66 | 67 | Monarch is BSD-3 licensed, as found in the [LICENSE](LICENSE) file. 68 | -------------------------------------------------------------------------------- /build-requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | setuptools 3 | setuptools-rust 4 | wheel 5 | numpy 6 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | too-many-lines-threshold = 200 2 | await-holding-invalid-types = [ 3 | { path = "tracing::span::Entered", reason = "`Entered` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 4 | { path = "tracing::span::EnteredSpan", reason = "`EnteredSpan` is not aware when a function is suspended: https://docs.rs/tracing/latest/tracing/struct.Span.html#in-asynchronous-code" }, 5 | ] 6 | -------------------------------------------------------------------------------- /controller/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/controller:[controller,controller-bin] 2 | 3 | [package] 4 | name = "controller" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "controller_bin" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | anyhow = "1.0.95" 16 | async-trait = "0.1.86" 17 | bincode = "1.3.3" 18 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 19 | const_format = "0.2" 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | monarch_messages = { version =
"0.0.0", path = "../monarch_messages" } 24 | nccl-sys = { path = "../nccl-sys" } 25 | ndslice = { version = "0.0.0", path = "../ndslice" } 26 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 27 | serde = { version = "1.0.185", features = ["derive", "rc"] } 28 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 29 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 30 | torch-sys = { path = "../torch-sys" } 31 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 32 | 33 | [dev-dependencies] 34 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 35 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 36 | -------------------------------------------------------------------------------- /controller/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // This is needed because the controller is built with torch/nccl deps via monarch_messages. 10 | 11 | fn main() { 12 | // `torch-sys` will set this env var through Cargo `links` metadata. 13 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 14 | // Set the rpath so that the dynamic linker can find libtorch and friends. 15 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 16 | 17 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 18 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 19 | } 20 | 21 | // Disable new dtags, as conda envs generally use `RPATH` over `RUNPATH`. 22 | println!("cargo::rustc-link-arg=-Wl,--disable-new-dtags"); 23 | 24 | println!("cargo:rustc-link-lib=lzma"); 25 | } 26 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/examples/__init__.py -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Example Notebooks 2 | 3 | This folder contains some basic examples of using the Monarch API in jupyter notebooks. 4 | 5 | ## Setup 6 | 1. Follow the instructions outlined in ../../monarch/README.md to set up Monarch 7 | 2. Pip install jupyter: 8 | `pip install jupyter notebook` 9 | 3. Run your jupyter notebook: `jupyter notebook` 10 | 4. (optional) In remote settings (as on a devserver), you can also port forward your jupyter notebook to your local machine, e.g. 11 | ``` 12 | # devserver 13 | jupyter notebook --no-browser --port=8098 14 | 15 | # local 16 | ssh -N -L 8098:localhost:8098 17 | ``` 18 | 5. Open localhost:8098 in your browser to see the jupyter notebook 19 | 20 | 21 | ## Manifest 22 | * ping_pong.ipynb - Simple hello world with Actor API + Inter Actor Communication 23 | -------------------------------------------------------------------------------- /hyper/Cargo.toml: -------------------------------------------------------------------------------- 1 | # This file is manually maintained to maintain the ability to build hyper.
The code is annotated with fbcode_build conditionals such that 2 | # it works with both cargo (all oss deps) and buck (full meta deps). 3 | [package] 4 | name = "hyper" 5 | version = "0.0.0" 6 | authors = ["Facebook"] 7 | edition = "2021" 8 | license = "MIT" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | clap = { version = "4.5.30", features = ["derive", "env", "string", "unicode", "wrap_help"] } 14 | hyperactor = { path = "../hyperactor" } 15 | hyperactor_multiprocess = { path = "../hyperactor_multiprocess" } 16 | serde = { version = "1.0.185", features = ["derive", "rc"] } 17 | serde_json = { version = "1.0.132", features = ["float_roundtrip", "unbounded_depth"] } 18 | tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } 19 | tokio = { version = "1.41.0", features = ["full", "test-util", "tracing"] } 20 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 21 | -------------------------------------------------------------------------------- /hyper/hyper.dotslash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env fbpython 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | import dotslash 9 | 10 | dotslash.export_fbcode_build( 11 | target="fbcode//monarch/hyper:hyper", 12 | oncall="monarch", 13 | ) 14 | -------------------------------------------------------------------------------- /hyper/src/commands.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod demo; 10 | pub mod procs; 11 | pub mod serve; 12 | pub mod show; 13 | #[cfg(fbcode_build)] 14 | pub mod top; 15 | -------------------------------------------------------------------------------- /hyper/src/commands/serve.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::time::Duration; 10 | 11 | use hyperactor::channel::ChannelAddr; 12 | use hyperactor::channel::ChannelTransport; 13 | use hyperactor_multiprocess::system::System; 14 | 15 | // The commands in the demo spawn temporary actors that join a system. 16 | // Set a long heartbeat duration so we do not check heartbeats for these actors. 17 | // [`Duration::from_secs`] is a stable API. Any APIs with units bigger than secs are unstable. 18 | static LONG_DURATION: Duration = Duration::from_secs(500000); 19 | 20 | #[derive(clap::Args, Debug)] 21 | pub struct ServeCommand { 22 | /// The address to serve the system actor on. If not specified, the local 23 | /// host will be used.
24 | #[arg(short, long)] 25 | addr: Option<ChannelAddr>, 26 | } 27 | 28 | impl ServeCommand { 29 | pub async fn run(self) -> anyhow::Result<()> { 30 | let addr = self.addr.unwrap_or(ChannelAddr::any(ChannelTransport::Tcp)); 31 | let handle = System::serve(addr, LONG_DURATION, LONG_DURATION).await?; 32 | eprintln!("serve: {}", handle.local_addr()); 33 | handle.await; 34 | Ok(()) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /hyper/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod tui; 10 | pub mod utils; 11 | -------------------------------------------------------------------------------- /hyper/src/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | mod commands; 10 | 11 | use clap::Parser; 12 | use clap::Subcommand; 13 | 14 | use crate::commands::demo::DemoCommand; 15 | use crate::commands::procs::ProcsCommand; 16 | use crate::commands::serve::ServeCommand; 17 | use crate::commands::show::ShowCommand; 18 | #[cfg(fbcode_build)] 19 | use crate::commands::top::TopCommand; 20 | 21 | #[derive(Parser)] 22 | #[command()] 23 | struct Cli { 24 | #[command(subcommand)] 25 | command: Command, 26 | } 27 | 28 | #[derive(Subcommand)] 29 | enum Command { 30 | /// Spawns and serves a system actor. 31 | Serve(ServeCommand), 32 | #[command(subcommand)] 33 | /// Demo some basic concepts of multiprocess hyperactor. Before using these 34 | /// commands, use `serve` to start a system actor, and get the system 35 | /// address from the output. 36 | Demo(DemoCommand), 37 | #[clap(about = r#"Show the state of a reference. For example: 38 | - System: show 39 | - World: show world 40 | - Gang: show world.gang 41 | - Proc: show world[2] 42 | - Actor: show world[3].actor[1]"#)] 43 | Show(ShowCommand), 44 | #[clap(about = "Show details about processes running in worlds.")] 45 | #[command(subcommand)] 46 | Procs(ProcsCommand), 47 | #[cfg(fbcode_build)] 48 | #[clap(about = "Show a dynamic real-time view of the system")] 49 | Top(TopCommand), 50 | } 51 | 52 | #[cfg(fbcode_build)] 53 | #[fbinit::main] 54 | async fn main(_: fbinit::FacebookInit) -> Result<(), anyhow::Error> { 55 | run().await 56 | } 57 | 58 | #[cfg(not(fbcode_build))] 59 | #[tokio::main] 60 | async fn main() -> Result<(), anyhow::Error> { 61 | run().await 62 | } 63 | 64 | async fn run() -> Result<(), anyhow::Error> { 65 | let args = Cli::parse(); 66 | hyperactor::initialize(); 67 | 68 | match args.command { 69 | Command::Serve(command) => Ok(command.run().await?), 70 | Command::Demo(command) => Ok(command.run().await?), 71 | Command::Show(command) => Ok(command.run().await?), 72 | Command::Procs(command) => Ok(command.run().await?), 73 | #[cfg(fbcode_build)] Command::Top(command) => Ok(command.run().await?), 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /hyper/src/tui/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc.
and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod top; 10 | -------------------------------------------------------------------------------- /hyper/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | pub mod system_address; 10 | -------------------------------------------------------------------------------- /hyperactor/src/cap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Capabilities used in various public APIs. 10 | 11 | /// CanSend is a capability that confers the right of the holder to send 12 | /// messages to actors. CanSend is sealed and may only be implemented 13 | /// and accessed by this crate. 14 | pub trait CanSend: sealed::CanSend {} 15 | impl<T: sealed::CanSend> CanSend for T {} 16 | 17 | /// CanOpenPort is a capability that confers the ability of the holder to 18 | /// open local ports, which can then be used to receive messages. 19 | pub trait CanOpenPort: sealed::CanOpenPort {} 20 | impl<T: sealed::CanOpenPort> CanOpenPort for T {} 21 | 22 | /// CanSplitPort is a capability that confers the ability of the holder to 23 | /// split ports. 24 | pub trait CanSplitPort: sealed::CanSplitPort {} 25 | impl<T: sealed::CanSplitPort> CanSplitPort for T {} 26 | 27 | /// CanSpawn is a capability that confers the ability to spawn a child 28 | /// actor. 29 | pub trait CanSpawn: sealed::CanSpawn {} 30 | impl<T: sealed::CanSpawn> CanSpawn for T {} 31 | 32 | pub(crate) mod sealed { 33 | use async_trait::async_trait; 34 | 35 | use crate::PortId; 36 | use crate::actor::Actor; 37 | use crate::actor::ActorHandle; 38 | use crate::data::Serialized; 39 | use crate::mailbox::Mailbox; 40 | 41 | pub trait CanSend: Send + Sync { 42 | fn post(&self, dest: PortId, data: Serialized); 43 | } 44 | 45 | pub trait CanOpenPort: Send + Sync { 46 | fn mailbox(&self) -> &Mailbox; 47 | } 48 | 49 | pub trait CanSplitPort: Send + Sync { 50 | fn split(&self, port_id: PortId, reducer: Option) -> PortId; 51 | } 52 | 53 | #[async_trait] 54 | pub trait CanSpawn: Send + Sync { 55 | async fn spawn<A: Actor>(&self, params: A::Params) -> anyhow::Result<ActorHandle<A>>; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /hyperactor/src/checkpoint.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Checkpoint functionality for various objects to save and load states. 10 | 11 | use std::fmt::Debug; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::RemoteMessage; 16 | use crate::mailbox::log::SeqId; 17 | 18 | /// Errors that occur during checkpoint operations. 19 | /// This enum is marked non-exhaustive to allow for extensibility.
20 | #[derive(thiserror::Error, Debug)] 21 | #[non_exhaustive] 22 | pub enum CheckpointError { 23 | /// An error occurred while saving a checkpoint. 24 | #[error("save")] 25 | Save(#[source] anyhow::Error), 26 | 27 | /// An error occurred while loading a checkpoint. 28 | #[error("load: {0}")] 29 | Load(SeqId, #[source] anyhow::Error), 30 | } 31 | 32 | /// [`Checkpointable`] is used to save the state of an instance so that it can be restored later. 33 | #[async_trait] 34 | pub trait Checkpointable: Send + Sync + Sized { 35 | /// The type of the state that is saved. The state can be serialized to and deserialized 36 | /// from persistent storage. 37 | type State: RemoteMessage; 38 | 39 | /// Saves the current state. 40 | async fn save(&self) -> Result<Self::State, CheckpointError>; 41 | 42 | /// Loads a saved state to restore the instance. 43 | async fn load(state: Self::State) -> Result<Self, CheckpointError>; 44 | } 45 | -------------------------------------------------------------------------------- /hyperactor/src/mailbox/mailbox_admin_message.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use serde::Deserialize; 10 | use serde::Serialize; 11 | 12 | pub use crate as hyperactor; 13 | use crate::HandleClient; 14 | use crate::Handler; 15 | use crate::Named; 16 | use crate::ProcId; 17 | use crate::RefClient; 18 | use crate::mailbox::ChannelAddr; 19 | 20 | /// Messages relating to mailbox administration. 21 | #[derive( 22 | Handler, 23 | HandleClient, 24 | RefClient, 25 | Debug, 26 | Serialize, 27 | Deserialize, 28 | Clone, 29 | PartialEq, 30 | Named 31 | )] 32 | pub enum MailboxAdminMessage { 33 | /// An address update. 34 | UpdateAddress { 35 | /// The ID of the proc. 36 | proc_id: ProcId, 37 | 38 | /// The address at which it listens. 39 | addr: ChannelAddr, 40 | }, 41 | } 42 | -------------------------------------------------------------------------------- /hyperactor/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A bunch of statically defined metrics. Defined here because they are used in 10 | //! both macros and handwritten code.
11 | 12 | use hyperactor_telemetry::declare_static_counter; 13 | use hyperactor_telemetry::declare_static_timer; 14 | use hyperactor_telemetry::declare_static_up_down_counter; 15 | 16 | declare_static_counter!(MESSAGES_SENT, "messages_sent"); 17 | declare_static_counter!(MESSAGES_RECEIVED, "messages_received"); 18 | declare_static_counter!(MESSAGE_HANDLE_ERRORS, "message_handle_errors"); 19 | declare_static_counter!(MESSAGE_RECEIVE_ERRORS, "message_receive_errors"); 20 | declare_static_up_down_counter!(MESSAGE_QUEUE_SIZE, "message_queue_size"); 21 | declare_static_timer!( 22 | MESSAGE_HANDLER_DURATION, 23 | "message_handler_duration", 24 | hyperactor_telemetry::TimeUnit::Nanos 25 | ); 26 | 27 | declare_static_timer!( 28 | ACTOR_STATUS, 29 | "actor.status", 30 | hyperactor_telemetry::TimeUnit::Nanos 31 | ); 32 | -------------------------------------------------------------------------------- /hyperactor/src/spawn.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::sync::Arc; 10 | use std::sync::atomic::AtomicU64; 11 | use std::sync::atomic::Ordering; 12 | 13 | use async_trait::async_trait; 14 | 15 | use crate::actor::Actor; 16 | use crate::actor::ActorHandle; 17 | use crate::cap::sealed::CanSpawn; 18 | use crate::mailbox::BoxedMailboxSender; 19 | use crate::reference::ActorId; 20 | #[derive(Debug)] 21 | struct LocalSpawnerState { 22 | root: ActorId, 23 | sender: BoxedMailboxSender, 24 | next_pid: AtomicU64, 25 | } 26 | 27 | #[derive(Clone, Debug)] 28 | pub(crate) struct LocalSpawner(Option<Arc<LocalSpawnerState>>); 29 | 30 | impl LocalSpawner { 31 | pub(crate) fn new(root: ActorId, sender: BoxedMailboxSender) -> Self { 32 | Self(Some(Arc::new(LocalSpawnerState { 33 | root, 34 | sender, 35 | next_pid: AtomicU64::new(1), 36 | }))) 37 | } 38 | 39 | pub(crate) fn new_panicking() -> Self { 40 | Self(None) 41 | } 42 | } 43 | 44 | #[async_trait] 45 | impl CanSpawn for LocalSpawner { 46 | async fn spawn<A: Actor>(&self, params: A::Params) -> ActorHandle<A> { 47 | let state = self.0.as_ref().expect("invalid spawner"); 48 | let pid = state.next_pid.fetch_add(1, Ordering::Relaxed); 49 | let actor_id = state.root.child_id(pid); 50 | A::do_spawn(state.sender.clone(), actor_id, params, self.clone()) 51 | .await 52 | .unwrap() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /hyperactor/src/supervision.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Messages used in supervision. 10 | 11 | use std::fmt::Debug; 12 | 13 | use serde::Deserialize; 14 | use serde::Serialize; 15 | 16 | use crate as hyperactor; // for macros 17 | use crate::Named; 18 | use crate::actor::ActorStatus; 19 | use crate::reference::ActorId; 20 | 21 | /// This is the local actor supervision event. The child actor will propagate this event to its parent. 22 | #[derive(Clone, Debug, Serialize, Deserialize, Named, PartialEq, Eq)] 23 | pub struct ActorSupervisionEvent { 24 | /// The actor id of the child actor where the event is triggered.
25 | actor_id: ActorId, 26 | /// Status of the child actor. 27 | actor_status: ActorStatus, 28 | } 29 | 30 | impl ActorSupervisionEvent { 31 | /// Create a new actor supervision event. 32 | pub fn new(actor_id: ActorId, actor_status: ActorStatus) -> Self { 33 | Self { 34 | actor_id, 35 | actor_status, 36 | } 37 | } 38 | /// Get the actor id of the supervision event. 39 | pub fn actor_id(&self) -> &ActorId { 40 | &self.actor_id 41 | } 42 | /// Get the actor status of the supervision event. 43 | pub fn actor_status(&self) -> &ActorStatus { 44 | &self.actor_status 45 | } 46 | 47 | /// Consume this event to a tuple. 48 | pub fn into_inner(self) -> (ActorId, ActorStatus) { 49 | (self.actor_id, self.actor_status) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /hyperactor/src/sync/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Synchronization primitives that are used by Hyperactor. 10 | //! 11 | //! These are used in related Hyperactor crates as well, and are thus part of the 12 | //! public API. However, they should not be considered a stable part of the Hyperactor 13 | //! API itself, and they may be moved to a different crate in the future. 14 | 15 | pub mod flag; 16 | pub mod monitor; 17 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// PingPongActor test util. 10 | pub mod pingpong; 11 | /// ProcSupervisionCoordinator test util. 12 | pub mod proc_supervison; 13 | /// Used to verify behaviors related to process. 14 | pub mod process_assertion; 15 | /// Used for using tracing in tests. 16 | pub mod tracing; 17 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/process_assertion.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::future::Future; 10 | 11 | use nix::sys::wait::WaitStatus; 12 | use nix::sys::wait::waitpid; 13 | use nix::unistd::ForkResult; 14 | use nix::unistd::fork; 15 | 16 | /// Fork a child process, execute the given function in that process, and verify 17 | /// that the process exits with the given exit code. 18 | pub async fn assert_termination(f: F, expected_code: i32) -> anyhow::Result<()> 19 | where 20 | F: FnOnce() -> Fut, 21 | Fut: Future, 22 | { 23 | // SAFETY: for unit test process assertion. 24 | unsafe { 25 | match fork() { 26 | Ok(ForkResult::Parent { child, .. }) => match waitpid(child, None)? { 27 | WaitStatus::Exited(_, exit_code) => { 28 | anyhow::ensure!(exit_code == expected_code); 29 | Ok(()) 30 | } 31 | status => Err(anyhow::anyhow!( 32 | "didn't receive expected status. 
got: {:?}", 33 | status 34 | )), 35 | }, 36 | Ok(ForkResult::Child) => Ok(f().await), 37 | Err(_) => Err(anyhow::anyhow!("fork failed")), 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /hyperactor/src/test_utils/tracing.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use tracing::Level; 10 | 11 | /// Set up a tracing subscriber with a filter, so we can print tracing logs 12 | /// with >= level by using *buck run*. 13 | /// 14 | /// Note this function does not work with *buck test*. 15 | // 16 | /// This is better than the traced_test macro when logs_contain and logs_assert 17 | /// are not needed, because that macro prints TRACE level logs, which is too 18 | /// verbose. 19 | pub fn set_tracing_env_filter(level: Level) { 20 | let subscriber = tracing_subscriber::fmt() 21 | .with_env_filter(tracing_subscriber::EnvFilter::new(level.as_str())) 22 | .finish(); 23 | tracing::subscriber::set_global_default(subscriber).expect("Failed to set subscriber"); 24 | } 25 | -------------------------------------------------------------------------------- /hyperactor_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_extension:hyperactor_extension 2 | 3 | [package] 4 | name = "hyperactor_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | async-trait = "0.1.86" 12 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 13 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 14 | ndslice = { version = "0.0.0", path = "../ndslice" } 15 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 16 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 17 | -------------------------------------------------------------------------------- /hyperactor_extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod alloc; 12 | pub mod telemetry; 13 | -------------------------------------------------------------------------------- /hyperactor_extension/src/telemetry.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | use pyo3::prelude::*; 12 | 13 | /// Log a message with the given metadata 14 | #[pyfunction] 15 | pub fn forward_to_tracing(message: &str, file: &str, lineno: i64, level: i32) { 16 | // Map level number to level name 17 | match level { 18 | 40 => tracing::error!(file = file, lineno = lineno, message), 19 | 30 => tracing::warn!(file = file, lineno = lineno, message), 20 | 20 => tracing::info!(file = file, lineno = lineno, message), 21 | 10 => tracing::debug!(file = file, lineno = lineno, message), 22 | _ => tracing::info!(file = file, lineno = lineno, message), 23 | } 24 | } 25 | 26 | use pyo3::Bound; 27 | use pyo3::types::PyModule; 28 | 29 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 30 | let f = wrap_pyfunction!(forward_to_tracing, module)?; 31 | f.setattr( 32 | "__module__", 33 | "monarch._rust_bindings.hyperactor_extension.telemetry", 34 | )?; 35 | module.add_function(f)?; 36 | Ok(()) 37 | } 38 | -------------------------------------------------------------------------------- /hyperactor_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_macros:hyperactor_macros 2 | 3 | [package] 4 | name = "hyperactor_macros" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "macros to support the Hyperactor actors and data exchange" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [lib] 13 | test = false 14 | doctest = false 15 | proc-macro = true 16 | 17 | [dependencies] 18 | convert_case = "0.6" 19 | indoc = "2.0.2" 20 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 21 | quote = "1.0.29" 22 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 23 | 24 | [dev-dependencies] 25 | hyperactor = { path = "../hyperactor" } 26 | serde = { version = "1.0.185", features = ["derive", "rc"] } 27 | tokio = { version = "1.37.0", features = ["full", "test-util", "tracing"] } 28 | -------------------------------------------------------------------------------- /hyperactor_macros/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | fn main() { 10 | println!("cargo::rustc-check-cfg=cfg(enable_hyperactor_message_logging)"); 11 | } 12 | -------------------------------------------------------------------------------- /hyperactor_mesh/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_mesh:[hyperactor_mesh,hyperactor_mesh_example_dining_philosophers,hyperactor_mesh_test_bootstrap] 2 | 3 | [package] 4 | name = "hyperactor_mesh" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "hyperactor_mesh_example_dining_philosophers" 12 | path = "examples/dining_philosophers.rs" 13 | 14 | [[bin]] 15 | name = "hyperactor_mesh_test_bootstrap" 16 | path = "test/bootstrap.rs" 17 | 18 | [dependencies] 19 | anyhow = "1.0.95" 20 | async-trait = "0.1.86" 21 | bincode = "1.3.3" 22 | bitmaps = "3.2.1" 23 | enum-as-inner = "0.6.0" 24 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 25 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 26 | hyperactor_mesh_macros = { version = "0.0.0", path = "../hyperactor_mesh_macros" } 27 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 28 | mockall = "0.13.1" 29 | ndslice = { version = "0.0.0", path = "../ndslice" } 30 | rand = { version = "0.8", features = ["small_rng"] } 31 | serde = { version = "1.0.185", features = ["derive", "rc"] } 32 | thiserror = "2.0.12" 33 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 34 | tokio-stream = { version = "0.1.17", features = ["fs", "io-util", "net", "signal", "sync", "time"] } 35 | tokio-util = { version = "0.7.15", features = ["full"] } 36 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 37 | 38 | [dev-dependencies] 39 | buck-resources = "1" 40 | maplit = "1.0" 41 | timed_test = { version = "0.0.0", path = "../timed_test" } 42 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 43 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/mesh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | use ndslice::Range; 11 | use ndslice::Shape; 12 | use ndslice::ShapeError; 13 | use ndslice::SliceIterator; 14 | 15 | /// A mesh of nodes, organized into the topology described by its shape (see [`Shape`]). 16 | #[async_trait] 17 | pub trait Mesh { 18 | /// The type of the node contained in the mesh. 19 | type Node; 20 | 21 | /// The type of a slice of this mesh. Slices should not outlive their 22 | /// parent mesh. 23 | type Sliced<'a>: Mesh + 'a 24 | where 25 | Self: 'a; 26 | 27 | /// The shape of this mesh. 28 | fn shape(&self) -> &Shape; 29 | 30 | /// Sub-slice this mesh, specifying the included ranges for 31 | /// the dimension with the labeled name. 32 | fn select>(&self, label: &str, range: R) 33 | -> Result, ShapeError>; 34 | 35 | /// Retrieve contained node at the provided index. The index is 36 | /// relative to the shape of the mesh. 37 | fn get(&self, index: usize) -> Option; 38 | 39 | /// Iterate over all the nodes in this mesh. 
40 | fn iter(&self) -> MeshIter<'_, Self> { 41 | MeshIter { 42 | mesh: self, 43 | slice_iter: self.shape().slice().iter(), 44 | } 45 | } 46 | } 47 | 48 | /// An iterator over the nodes of a mesh. 49 | pub struct MeshIter<'a, M: Mesh + ?Sized> { 50 | mesh: &'a M, 51 | slice_iter: SliceIterator<'a>, 52 | } 53 | 54 | impl Iterator for MeshIter<'_, M> { 55 | type Item = M::Node; 56 | 57 | fn next(&mut self) -> Option { 58 | self.slice_iter 59 | .next() 60 | .map(|index| self.mesh.get(index).unwrap()) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/metrics.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor_telemetry::*; 10 | 11 | declare_static_timer!( 12 | ACTOR_MESH_CAST_DURATION, 13 | "actor_mesh_cast_duration", 14 | TimeUnit::Micros 15 | ); 16 | -------------------------------------------------------------------------------- /hyperactor_mesh/src/test_utils.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use async_trait::async_trait; 10 | use hyperactor::Actor; 11 | use hyperactor::Handler; 12 | use hyperactor::Instance; 13 | use hyperactor::Named; 14 | use hyperactor::message::Bind; 15 | use hyperactor::message::Bindings; 16 | use hyperactor::message::IndexedErasedUnbound; 17 | use hyperactor::message::Unbind; 18 | use serde::Deserialize; 19 | use serde::Serialize; 20 | 21 | use crate::actor_mesh::Cast; 22 | 23 | /// Message that can be sent to an EmptyActor. 24 | #[derive(Serialize, Deserialize, Debug, Named, Clone)] 25 | pub struct EmptyMessage(); 26 | 27 | // TODO(pzhang) replace the boilerplate Bind/Unbind impls with a macro. 28 | impl Bind for EmptyMessage { 29 | fn bind(self, _bindings: &Bindings) -> anyhow::Result { 30 | Ok(self) 31 | } 32 | } 33 | 34 | impl Unbind for EmptyMessage { 35 | fn bindings(&self) -> anyhow::Result { 36 | Ok(Bindings::default()) 37 | } 38 | } 39 | 40 | /// No-op actor. 41 | #[derive(Debug, PartialEq)] 42 | #[hyperactor::export( 43 | EmptyMessage, 44 | Cast, IndexedErasedUnbound> 45 | )] 46 | pub struct EmptyActor(); 47 | 48 | #[async_trait] 49 | impl Actor for EmptyActor { 50 | type Params = (); 51 | 52 | async fn new(_: ()) -> Result { 53 | Ok(Self()) 54 | } 55 | } 56 | 57 | #[async_trait] 58 | impl Handler for EmptyActor { 59 | async fn handle(&mut self, _: &Instance, _: EmptyMessage) -> Result<(), anyhow::Error> { 60 | Ok(()) 61 | } 62 | } 63 | 64 | #[async_trait] 65 | impl Handler> for EmptyActor { 66 | async fn handle( 67 | &mut self, 68 | _: &Instance, 69 | _: Cast, 70 | ) -> Result<(), anyhow::Error> { 71 | Ok(()) 72 | } 73 | } 74 | hyperactor::remote!(EmptyActor); 75 | -------------------------------------------------------------------------------- /hyperactor_mesh/test/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | /// This is an "empty shell" bootstrap process, 10 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 11 | #[tokio::main] 12 | async fn main() { 13 | hyperactor_mesh::bootstrap_or_die().await; 14 | } 15 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_mesh_macros:hyperactor_mesh_macros 2 | 3 | [package] 4 | name = "hyperactor_mesh_macros" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [dependencies] 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | -------------------------------------------------------------------------------- /hyperactor_mesh_macros/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // Clippy can't see through quote! to use of proc-macro2 10 | #![allow(unused_crate_dependencies)] 11 | 12 | extern crate proc_macro; 13 | 14 | use proc_macro::TokenStream; 15 | use quote::quote; 16 | 17 | /// Parse a compact selection expression into a [`Selection`]. See 18 | /// [`selection::parse`] for syntax documentation. 19 | #[proc_macro] 20 | pub fn sel(input: TokenStream) -> TokenStream { 21 | match ndslice::selection::token_parser::parse_tokens(input.into()) { 22 | Ok(selection) => { 23 | let tokens = ndslice::selection::token_parser::selection_to_tokens(&selection); 24 | quote!(#tokens).into() 25 | } 26 | Err(e) => { 27 | let msg = format!("sel! 
parse failed: {}", e); 28 | quote!(compile_error!(#msg)).into() 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_multiprocess:hyperactor_multiprocess 2 | 3 | [package] 4 | name = "hyperactor_multiprocess" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 15 | enum-as-inner = "0.6.0" 16 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 17 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 18 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 19 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 20 | py-spy = { git = "https://github.com/technicianted/py-spy", rev = "8f74f3e4f955fee57f0d4a8103511ee788348a2a", features = ["unwind"] } 21 | remoteprocess = { git = "https://github.com/technicianted/remoteprocess", rev = "72505594a19d80c07df6f1dc4a80556b7e462148" } 22 | serde = { version = "1.0.185", features = ["derive", "rc"] } 23 | thiserror = "2.0.12" 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | tokio-retry = "0.3" 26 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 27 | 28 | [dev-dependencies] 29 | maplit = "1.0" 30 | rand = { version = "0.8", features = ["small_rng"] } 31 | regex = "1.11.1" 32 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 33 | timed_test = { version = "0.0.0", path = "../timed_test" } 34 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 35 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Multiprocess actor system and support. 10 | 11 | #![feature(assert_matches)] 12 | #![feature(never_type)] 13 | #![deny(missing_docs)] 14 | 15 | /// TODO: add missing doc. 16 | pub mod ping_pong; 17 | pub mod proc_actor; 18 | /// TODO: add missing doc. 19 | pub mod scheduler; 20 | /// TODO: add missing doc. 21 | pub mod supervision; 22 | /// TODO: add missing doc. 23 | pub mod system; 24 | pub mod system_actor; 25 | 26 | /// py-spy wrapper. 27 | pub mod pyspy; 28 | 29 | pub use hyperactor::actor; 30 | pub use system::System; 31 | -------------------------------------------------------------------------------- /hyperactor_multiprocess/src/scheduler.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
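The `sel!` macro in hyperactor_mesh_macros/src/lib.rs above turns a token-level parse failure into a `compile_error!` so the problem surfaces as a normal compiler diagnostic. The following is a hedged, standalone sketch of that "parse or compile_error!" pattern with a trivial toy parser standing in for ndslice's selection token parser; it must live in a crate with `proc-macro = true`, as in the Cargo.toml above.

// Toy proc macro illustrating the compile_error!-on-parse-failure pattern.
extern crate proc_macro;

use proc_macro::TokenStream;
use quote::quote;

#[proc_macro]
pub fn shout(input: TokenStream) -> TokenStream {
    let text = input.to_string();
    if text.is_empty() {
        // Turn a macro-time failure into a readable compiler diagnostic.
        let msg = "shout! expects a non-empty expression";
        quote!(compile_error!(#msg)).into()
    } else {
        // Expand to a string literal containing the upper-cased input.
        let upper = text.to_uppercase();
        quote!(#upper).into()
    }
}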
7 | */ 8 | 9 | use async_trait::async_trait; 10 | 11 | /// TODO: add missing doc 12 | #[async_trait] 13 | pub trait Scheduler { 14 | /// TODO: add missing doc 15 | type GangHandle; 16 | /// TODO: add missing doc 17 | async fn schedule_gang(&self, size: u64) -> Result; 18 | } 19 | 20 | /// TODO: add missing doc 21 | pub struct UnimplementedScheduler; 22 | 23 | #[async_trait] 24 | impl Scheduler for UnimplementedScheduler { 25 | type GangHandle = !; 26 | 27 | async fn schedule_gang(&self, _size: u64) -> Result { 28 | unimplemented!() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /hyperactor_telemetry/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry:hyperactor_telemetry 2 | 3 | [package] 4 | name = "hyperactor_telemetry" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 13 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 14 | hdrhistogram = "7.5" 15 | lazy_static = { version = "1.5", features = ["spin_no_std"], default-features = false } 16 | opentelemetry = "0.29" 17 | opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] } 18 | rand = { version = "0.8", features = ["small_rng"] } 19 | scuba = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main", optional = true } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 22 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 23 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 24 | tracing-appender = "0.2.3" 25 | tracing-core = { version = "0.1.33", features = ["valuable"] } 26 | tracing-glog = { version = "0.4.0", features = ["ansi", "tracing-log"] } 27 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 28 | 29 | [features] 30 | default = [] 31 | fbcode_build = ["fbinit", "scuba"] 32 | 33 | [lints] 34 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 35 | -------------------------------------------------------------------------------- /hyperactor_telemetry/src/otel.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
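The `Scheduler` trait in scheduler.rs above combines `async_trait` with an associated `GangHandle` type, and `UnimplementedScheduler` uses the never type `!` (enabled by `#![feature(never_type)]` in lib.rs) as a handle that can never be produced. A minimal toy implementation of the same shape is sketched below; the concrete handle and error types here are assumptions, since the generic arguments were lost in this listing.

// Toy scheduler mirroring the async-trait + associated-handle shape above.
use async_trait::async_trait;

#[async_trait]
pub trait ToyScheduler {
    type GangHandle;
    async fn schedule_gang(&self, size: u64) -> anyhow::Result<Self::GangHandle>;
}

pub struct LocalScheduler;

#[async_trait]
impl ToyScheduler for LocalScheduler {
    type GangHandle = Vec<u64>;

    async fn schedule_gang(&self, size: u64) -> anyhow::Result<Self::GangHandle> {
        // Pretend each member of the gang is identified by its rank.
        Ok((0..size).collect())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let handle = LocalScheduler.schedule_gang(4).await?;
    println!("scheduled gang: {handle:?}");
    Ok(())
}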
7 | */ 8 | 9 | pub fn tracing_layer< 10 | S: tracing::Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>, 11 | >() -> Option> { 12 | #[cfg(fbcode_build)] 13 | { 14 | Some(crate::meta::tracing_layer()) 15 | } 16 | #[cfg(not(fbcode_build))] 17 | { 18 | None:: + Send + Sync>> 19 | } 20 | } 21 | 22 | pub fn init_metrics() { 23 | #[cfg(fbcode_build)] 24 | { 25 | opentelemetry::global::set_meter_provider(crate::meta::meter_provider()); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /hyperactor_telemetry/src/pool.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::sync::mpmc::Receiver; 10 | use std::sync::mpmc::Sender; 11 | use std::sync::mpmc::sync_channel; 12 | use std::sync::mpsc::TryRecvError; 13 | 14 | /// A basic thread-safe pool of objects with a fixed capacity. 15 | /// The implementation uses a simple mpmc channel to store the 16 | /// objects. 17 | #[derive(Debug)] 18 | pub(crate) struct Pool { 19 | sender: Sender, 20 | receiver: Receiver, 21 | } 22 | 23 | impl Pool { 24 | pub(crate) fn new(cap: usize) -> Self { 25 | let (sender, receiver) = sync_channel(cap); 26 | Self { sender, receiver } 27 | } 28 | 29 | pub(crate) fn get(&self) -> T { 30 | match self.receiver.try_recv() { 31 | Ok(val) => val, 32 | Err(TryRecvError::Empty) => Default::default(), 33 | Err(TryRecvError::Disconnected) => panic!("channel disconnected"), 34 | } 35 | } 36 | 37 | #[allow(dead_code)] 38 | pub(crate) fn put(&self, value: T) { 39 | let _ = self.sender.try_send(value); 40 | } 41 | } 42 | 43 | // Manual implementation to avoid demanding T: Clone 44 | impl Clone for Pool { 45 | fn clone(&self) -> Self { 46 | Self { 47 | sender: self.sender.clone(), 48 | receiver: self.receiver.clone(), 49 | } 50 | } 51 | } 52 | 53 | #[cfg(test)] 54 | mod tests { 55 | use super::*; 56 | 57 | #[test] 58 | fn test_basic() { 59 | let pool: Pool = Pool::new(2); 60 | 61 | assert_eq!(pool.get(), 0); 62 | pool.put(1); 63 | assert_eq!(pool.get(), 1); 64 | pool.put(2); 65 | assert_eq!(pool.get(), 2); 66 | assert_eq!(pool.get(), 0); 67 | pool.put(3); 68 | pool.put(4); 69 | pool.put(5); 70 | assert_eq!(pool.get(), 3); 71 | assert_eq!(pool.get(), 4); 72 | assert_eq!(pool.get(), 0); 73 | 74 | pool.put(3); 75 | assert_eq!(pool.clone().get(), 3); 76 | pool.clone().put(123); 77 | assert_eq!(pool.get(), 123); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /hyperactor_telemetry/stubs/fbinit/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Stub implementation of fbinit for OSS builds 10 | //! 11 | //! This is a minimal implementation that provides the necessary API surface 12 | //! for code that depends on fbinit, but doesn't actually do anything. 
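The `Pool` in pool.rs above hands out a pooled value when one is available, falls back to `Default::default()` when the channel is empty, and drops returned values once the channel is full. It relies on the unstable `std::sync::mpmc` API; the sketch below reproduces the same get-or-default / best-effort-put behavior on stable Rust using the crossbeam-channel crate, which is an assumption for illustration and not a dependency of this crate.

// Stable-Rust sketch of the pool-with-default-fallback pattern.
use crossbeam_channel::{bounded, Receiver, Sender, TryRecvError};

#[derive(Debug)]
struct Pool<T: Default> {
    sender: Sender<T>,
    receiver: Receiver<T>,
}

impl<T: Default> Pool<T> {
    fn new(cap: usize) -> Self {
        let (sender, receiver) = bounded(cap);
        Self { sender, receiver }
    }

    // Hand out a pooled value, or a fresh default if the pool is empty.
    fn get(&self) -> T {
        match self.receiver.try_recv() {
            Ok(val) => val,
            Err(TryRecvError::Empty) => T::default(),
            Err(TryRecvError::Disconnected) => panic!("channel disconnected"),
        }
    }

    // Return a value; silently drop it if the pool is already full.
    fn put(&self, value: T) {
        let _ = self.sender.try_send(value);
    }
}

fn main() {
    let pool: Pool<u64> = Pool::new(2);
    pool.put(7);
    assert_eq!(pool.get(), 7);
    assert_eq!(pool.get(), 0); // empty pool falls back to Default
}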
13 | 14 | /// A stub for the fbinit context 15 | #[derive(Clone, Copy, Debug)] 16 | pub struct FacebookInit; 17 | 18 | /// A trait for types that require fbinit 19 | pub trait MainWithFbinit { 20 | fn init_and_run(self, _fb: FacebookInit) -> i32; 21 | } 22 | 23 | /// Initialize the Facebook runtime (stub implementation) 24 | pub fn initialize_with_client_logging(_args: &[&str]) -> FacebookInit { 25 | FacebookInit 26 | } 27 | 28 | /// Initialize the Facebook runtime (stub implementation) 29 | pub fn initialize() -> FacebookInit { 30 | FacebookInit 31 | } 32 | 33 | /// Run a function with fbinit (stub implementation) 34 | pub fn run_with_init(f: F) -> R 35 | where 36 | F: FnOnce(FacebookInit) -> R, 37 | { 38 | f(FacebookInit) 39 | } 40 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/hyperactor_telemetry/tester:tester 2 | 3 | [package] 4 | name = "tester" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "tester" 12 | path = "main.rs" 13 | 14 | [dependencies] 15 | hyperactor_telemetry = { version = "0.0.0", path = ".." } 16 | opentelemetry = "0.29" 17 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 18 | 19 | [lints] 20 | rust = { unexpected_cfgs = { check-cfg = ["cfg(fbcode_build)"], level = "warn" } } 21 | -------------------------------------------------------------------------------- /hyperactor_telemetry/tester/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
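The stub fbinit crate above lets code written against the Meta-internal API compile in OSS builds while the `FacebookInit` token carries no state. A small usage sketch, assuming this stub is wired in as the `fbinit` path dependency:

// Usage sketch against the stub API above.
fn main() {
    let exit_code = fbinit::run_with_init(|_fb: fbinit::FacebookInit| {
        // Internal builds would use the token to initialize logging/telemetry;
        // with the stub this closure simply runs and returns an exit code.
        0i32
    });
    std::process::exit(exit_code);
}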
7 | */ 8 | 9 | use hyperactor_telemetry::declare_static_counter; 10 | use hyperactor_telemetry::declare_static_gauge; 11 | use hyperactor_telemetry::declare_static_histogram; 12 | use hyperactor_telemetry::initialize_logging; 13 | 14 | // Declare static metrics for testing 15 | declare_static_counter!(REQUEST_COUNT, "test_requests"); 16 | declare_static_gauge!(MEMORY_USAGE, "test_memory_usage"); 17 | declare_static_histogram!(REQUEST_DURATION, "test_request_duration"); 18 | 19 | #[tracing::instrument] 20 | fn something_an_actor_would_do() { 21 | tracing::debug!("debug message"); 22 | } 23 | 24 | fn main() { 25 | // Initialize logging with default configuration 26 | initialize_logging(); 27 | tracing::info!("info log"); 28 | } 29 | -------------------------------------------------------------------------------- /monarch_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_extension:monarch_extension-lib 2 | 3 | [package] 4 | name = "monarch_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | name = "_rust_bindings" 12 | test = false 13 | doctest = false 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | anyhow = "1.0.95" 18 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 19 | controller = { version = "0.0.0", path = "../controller" } 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" } 24 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 25 | monarch_simulator_lib = { version = "0.0.0", path = "../monarch_simulator" } 26 | monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker" } 27 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 28 | nccl-sys = { path = "../nccl-sys" } 29 | ndslice = { version = "0.0.0", path = "../ndslice" } 30 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 33 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 34 | -------------------------------------------------------------------------------- /monarch_extension/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 
13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | 15 | if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") { 16 | println!("cargo::rustc-link-arg=-Wl,-rpath,{path}"); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /monarch_extension/src/panic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use pyo3::prelude::*; 10 | 11 | /// A function that panics when called. 12 | /// This is used for testing panic handling in the Python bindings. 13 | #[pyfunction] 14 | pub fn panicking_function() { 15 | panic!("This is a deliberate panic from panicking_function"); 16 | } 17 | 18 | /// Register Python bindings for the panic module. 19 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 20 | let f = wrap_pyfunction!(panicking_function, module)?; 21 | f.setattr( 22 | "__module__", 23 | "monarch._rust_bindings.monarch_extension.panic", 24 | )?; 25 | module.add_function(f)?; 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /monarch_hyperactor/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_hyperactor:[monarch_hyperactor,process_allocator-oss] 2 | 3 | [package] 4 | name = "monarch_hyperactor" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 15 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 16 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 17 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 18 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 19 | hyperactor_telemetry = { version = "0.0.0", path = "../hyperactor_telemetry" } 20 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 21 | ndslice = { version = "0.0.0", path = "../ndslice" } 22 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 23 | pyo3-async-runtimes = { git = "https://github.com/PyO3/pyo3-async-runtimes", rev = "f6bb9b471a5b7765dd770af36e83f26802459621", features = ["attributes", "tokio-runtime"] } 24 | serde = { version = "1.0.185", features = ["derive", "rc"] } 25 | serde_bytes = "0.11" 26 | thiserror = "2.0.12" 27 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 28 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 29 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/actor_mesh.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use std::sync::Arc; 10 | 11 | use hyperactor::ActorRef; 12 | use hyperactor_mesh::Mesh; 13 | use hyperactor_mesh::RootActorMesh; 14 | use hyperactor_mesh::actor_mesh::ActorMesh; 15 | use pyo3::exceptions::PyException; 16 | use pyo3::prelude::*; 17 | 18 | use crate::actor::PythonActor; 19 | use crate::actor::PythonMessage; 20 | use crate::mailbox::PyMailbox; 21 | use crate::proc::PyActorId; 22 | use crate::shape::PyShape; 23 | 24 | #[pyclass( 25 | name = "PythonActorMesh", 26 | module = "monarch._rust_bindings.monarch_hyperactor.actor_mesh" 27 | )] 28 | pub struct PythonActorMesh { 29 | pub(super) inner: Arc>, 30 | pub(super) client: PyMailbox, 31 | } 32 | 33 | #[pymethods] 34 | impl PythonActorMesh { 35 | fn cast(&self, message: &PythonMessage) -> PyResult<()> { 36 | use ndslice::selection::dsl::*; 37 | self.inner 38 | .cast(all(true_()), message.clone()) 39 | .map_err(|err| PyException::new_err(err.to_string()))?; 40 | Ok(()) 41 | } 42 | 43 | // Consider defining a "PythonActorRef", which carries specifically 44 | // a reference to python message actors. 45 | fn get(&self, rank: usize) -> Option { 46 | self.inner 47 | .get(rank) 48 | .map(ActorRef::into_actor_id) 49 | .map(PyActorId::from) 50 | } 51 | 52 | #[getter] 53 | fn client(&self) -> PyMailbox { 54 | self.client.clone() 55 | } 56 | 57 | #[getter] 58 | fn shape(&self) -> PyShape { 59 | PyShape::from(self.inner.shape().clone()) 60 | } 61 | } 62 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { 63 | hyperactor_mod.add_class::()?; 64 | Ok(()) 65 | } 66 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bin/process_allocator/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | mod common; 10 | 11 | use clap::Parser; 12 | use common::Args; 13 | use common::main_impl; 14 | use hyperactor::channel::ChannelAddr; 15 | 16 | #[tokio::main] 17 | async fn main() { 18 | let args = Args::parse(); 19 | hyperactor::initialize(); 20 | 21 | let bind = format!("{}:{}", args.addr, args.port); 22 | let socket_addr: std::net::SocketAddr = bind.parse().unwrap(); 23 | let serve_address = ChannelAddr::Tcp(socket_addr); 24 | 25 | let _ = main_impl(serve_address, args.program).await.unwrap(); 26 | } 27 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
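The `cast` method in actor_mesh.rs above addresses every rank in the mesh by building a selection with ndslice's selection DSL. A minimal sketch of constructing that broadcast selection on its own is shown below; it only uses the `all` and `true_` combinators visible in the code above and assumes, as that usage suggests, that they produce a `Selection` value.

// Building the "every rank" selection used by PythonActorMesh::cast.
use ndslice::selection::dsl::{all, true_};
use ndslice::Selection;

fn select_everything() -> Selection {
    // all(true_()) applies no coordinate filtering: the broadcast case.
    all(true_())
}

fn main() {
    println!("{:?}", select_everything());
}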
7 | */ 8 | 9 | use hyperactor_mesh::bootstrap_or_die; 10 | use pyo3::Bound; 11 | use pyo3::PyAny; 12 | use pyo3::PyResult; 13 | use pyo3::Python; 14 | use pyo3::pyfunction; 15 | use pyo3::types::PyAnyMethods; 16 | use pyo3::types::PyModule; 17 | use pyo3::types::PyModuleMethods; 18 | use pyo3::wrap_pyfunction; 19 | 20 | #[pyfunction] 21 | #[pyo3(signature = ())] 22 | pub fn bootstrap_main(py: Python) -> PyResult> { 23 | hyperactor::tracing::debug!("entering async bootstrap"); 24 | pyo3_async_runtimes::tokio::future_into_py::<_, ()>(py, async move { 25 | bootstrap_or_die().await; 26 | }) 27 | } 28 | 29 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { 30 | let f = wrap_pyfunction!(bootstrap_main, hyperactor_mod)?; 31 | f.setattr( 32 | "__module__", 33 | "monarch._rust_bindings.monarch_hyperactor.bootstrap", 34 | )?; 35 | hyperactor_mod.add_function(f)?; 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod actor; 12 | pub mod actor_mesh; 13 | pub mod alloc; 14 | pub mod bootstrap; 15 | pub mod mailbox; 16 | pub mod ndslice; 17 | pub mod proc; 18 | pub mod proc_mesh; 19 | pub mod runtime; 20 | pub mod selection; 21 | pub mod shape; 22 | -------------------------------------------------------------------------------- /monarch_hyperactor/src/selection.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use ndslice::selection::Selection; 10 | use pyo3::PyResult; 11 | use pyo3::prelude::*; 12 | use pyo3::types::PyType; 13 | 14 | #[pyclass( 15 | name = "Selection", 16 | module = "monarch._rust_bindings.monarch_hyperactor.selection", 17 | frozen 18 | )] 19 | pub struct PySelection { 20 | inner: Selection, 21 | } 22 | 23 | impl From for PySelection { 24 | fn from(inner: Selection) -> Self { 25 | Self { inner } 26 | } 27 | } 28 | 29 | #[pymethods] 30 | impl PySelection { 31 | #[getter] 32 | fn __repr__(&self) -> String { 33 | format!("{:?}", self.inner) 34 | } 35 | 36 | #[classmethod] 37 | #[pyo3(name = "from_string")] 38 | pub fn parse(_cls: Bound<'_, PyType>, input: &str) -> PyResult { 39 | let selection = ndslice::selection::parse::parse(input).map_err(|err| { 40 | pyo3::exceptions::PyValueError::new_err(format!("parse error: {err}")) 41 | })?; 42 | 43 | Ok(PySelection::from(selection)) 44 | } 45 | } 46 | 47 | pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> { 48 | module.add_class::()?; 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /monarch_messages/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_messages:monarch_messages 2 | 3 | [package] 4 | name = "monarch_messages" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | derive_more = { version = "1.0.0", features = ["full"] } 13 | enum-as-inner = "0.6.0" 14 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 15 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 18 | serde = { version = "1.0.185", features = ["derive", "rc"] } 19 | serde_bytes = "0.11" 20 | thiserror = "2.0.12" 21 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 22 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 23 | 24 | [dev-dependencies] 25 | paste = "1.0.14" 26 | -------------------------------------------------------------------------------- /monarch_messages/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_messages/src/debugger.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | // NOTE: Until https://github.com/PyO3/pyo3/pull/4674, `pyo3::pymethods` trigger 10 | // and unsafe-op-in-unsafe-fn warnings. 
11 | #![allow(unsafe_op_in_unsafe_fn)] 12 | 13 | use derive_more::From; 14 | use hyperactor::Named; 15 | use pyo3::Bound; 16 | use pyo3::PyResult; 17 | use pyo3::types::PyModule; 18 | use pyo3::types::PyModuleMethods; 19 | use serde::Deserialize; 20 | use serde::Serialize; 21 | 22 | pub fn register_python_bindings(debugger: &Bound<'_, PyModule>) -> PyResult<()> { 23 | debugger.add_class::()?; 24 | Ok(()) 25 | } 26 | 27 | /// Enumerates the actions relevant to PDB debugging sessions. 28 | #[derive(Debug, Deserialize, Clone, Serialize, PartialEq)] 29 | #[pyo3::pyclass(frozen, module = "monarch._rust_bindings.monarch_messages.debugger")] 30 | pub enum DebuggerAction { 31 | /// Sent from worker to client to indicate that the worker has entered 32 | /// a pdb debugging session. 33 | Paused(), 34 | 35 | /// Sent from client to worker to indicate that the client has started 36 | /// the debugging session. 37 | Attach(), 38 | 39 | /// Sent to client or to worker to end the debugging session. 40 | Detach(), 41 | 42 | /// Sent to client or to worker to write bytes to receiver's stdout. 43 | Write { 44 | #[serde(with = "serde_bytes")] 45 | bytes: Vec, 46 | }, 47 | 48 | /// Sent from worker to client to read bytes from client's stdin. 49 | Read { requested_size: usize }, 50 | } 51 | 52 | #[derive(Serialize, Deserialize, Debug, Clone, Named, From)] 53 | pub enum DebuggerMessage { 54 | Action { action: DebuggerAction }, 55 | } 56 | -------------------------------------------------------------------------------- /monarch_messages/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | pub mod client; 12 | pub mod controller; 13 | pub mod debugger; 14 | pub mod wire_value; 15 | pub mod worker; 16 | -------------------------------------------------------------------------------- /monarch_messages/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
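`DebuggerAction` in debugger.rs above derives Serialize/Deserialize and uses `serde_bytes` for the raw stdout/stdin payloads, so the actions round-trip cleanly over a binary wire format. The sketch below mirrors a couple of those variants in a standalone toy enum to show the round trip; it is not the crate's own type, and bincode is used here only because it appears as a dependency elsewhere in this tree.

// Standalone round-trip sketch for a DebuggerAction-style enum.
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
enum ToyDebuggerAction {
    Attach(),
    Write {
        #[serde(with = "serde_bytes")]
        bytes: Vec<u8>,
    },
    Read {
        requested_size: usize,
    },
}

fn main() -> anyhow::Result<()> {
    let action = ToyDebuggerAction::Write { bytes: b"(Pdb) ".to_vec() };
    let wire = bincode::serialize(&action)?;
    let back: ToyDebuggerAction = bincode::deserialize(&wire)?;
    assert_eq!(back, action);
    Ok(())
}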
6 | 7 | import math 8 | 9 | 10 | def has_nan(t): 11 | return math.isnan(t) 12 | -------------------------------------------------------------------------------- /monarch_meta_extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_meta_extension:monarch_meta_extension-lib 2 | 3 | [package] 4 | name = "monarch_meta_extension" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | name = "_lib_meta" 12 | test = false 13 | doctest = false 14 | crate-type = ["cdylib"] 15 | 16 | [dependencies] 17 | anyhow = "1.0.95" 18 | async-trait = "0.1.86" 19 | fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" } 20 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 21 | hyperactor_extension = { version = "0.0.0", path = "../hyperactor_extension" } 22 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 23 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 24 | pyo3-async-runtimes = { git = "https://github.com/PyO3/pyo3-async-runtimes", rev = "f6bb9b471a5b7765dd770af36e83f26802459621", features = ["attributes", "tokio-runtime"] } 25 | serde_derive = "1.0.185" 26 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 27 | -------------------------------------------------------------------------------- /monarch_meta_extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![allow(unsafe_op_in_unsafe_fn)] 10 | 11 | pub mod alloc; 12 | pub mod alloc_mock; 13 | 14 | use pyo3::prelude::*; 15 | 16 | #[pymodule] 17 | #[pyo3(name = "_lib_meta")] 18 | pub fn mod_init(module: &Bound<'_, PyModule>) -> PyResult<()> { 19 | //Safety: This needs to be called here because we can't use fbinit::main 20 | unsafe { 21 | fbinit::perform_init(); 22 | } 23 | 24 | ::hyperactor::initialize(); 25 | 26 | let hyperactor_mod = PyModule::new_bound(module.py(), "hyperactor_meta")?; 27 | alloc::register_python_bindings(&hyperactor_mod)?; 28 | alloc_mock::register_python_bindings(&hyperactor_mod)?; 29 | 30 | module.add_submodule(&hyperactor_mod)?; 31 | Ok(()) 32 | } 33 | -------------------------------------------------------------------------------- /monarch_rdma/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma:monarch_rdma 2 | 3 | [package] 4 | name = "monarch_rdma" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 14 | hyperactor_mesh = { version = "0.0.0", path = "../hyperactor_mesh" } 15 | ibverbs = "0.7.1" 16 | ndslice = { version = "0.0.0", path = "../ndslice" } 17 | rand = { version = "0.8", features = ["small_rng"] } 18 | serde = { version = "1.0.185", features = ["derive", "rc"] } 19 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 20 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 21 | 22 | [dev-dependencies] 23 | timed_test = { version = "0.0.0", path = "../timed_test" } 24 | -------------------------------------------------------------------------------- /monarch_rdma/examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_rdma/examples:[parameter_server,parameter_server_bootstrap,parameter_server_example] 2 | 3 | [package] 4 | name = "parameter_server" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | path = "parameter_server.rs" 12 | 13 | [[bin]] 14 | name = "parameter_server_bootstrap" 15 | path = "bootstrap.rs" 16 | 17 | [[bin]] 18 | name = "parameter_server_example" 19 | path = "main.rs" 20 | 21 | [dependencies] 22 | anyhow = "1.0.95" 23 | async-trait = "0.1.86" 24 | buck-resources = "1" 25 | hyperactor = { version = "0.0.0", path = "../../hyperactor" } 26 | hyperactor_mesh = { version = "0.0.0", path = "../../hyperactor_mesh" } 27 | monarch_rdma = { version = "0.0.0", path = ".." } 28 | ndslice = { version = "0.0.0", path = "../../ndslice" } 29 | serde = { version = "1.0.185", features = ["derive", "rc"] } 30 | timed_test = { version = "0.0.0", path = "../../timed_test" } 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 33 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 34 | -------------------------------------------------------------------------------- /monarch_rdma/examples/bootstrap.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 
4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #![allow(unused)] 10 | use std::hint::black_box; 11 | 12 | use monarch_rdma::RdmaManagerActor; 13 | use parameter_server::ParameterServerActor; 14 | use parameter_server::WorkerActor; 15 | 16 | /// This is an "empty shell" bootstrap process, 17 | /// simply invoking [`hyperactor_mesh::bootstrap_or_die`]. 18 | #[tokio::main] 19 | async fn main() { 20 | tracing_subscriber::fmt() 21 | .with_max_level(tracing::Level::INFO) 22 | .init(); 23 | // The following black_box lines force-link the actors needed for the parameter server 24 | // example to run. Relying on side-effects for actor registration is not consistent across 25 | // all build modes. 26 | let _ = black_box::>(None); 27 | let _ = black_box::>(None); 28 | let _ = black_box::>(None); 29 | hyperactor_mesh::bootstrap_or_die().await; 30 | } 31 | -------------------------------------------------------------------------------- /monarch_rdma/examples/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Main running script for parameter server example. 10 | //! 11 | //! This script needs to be kept separate to avoid buck naming collisions. 12 | //! 13 | //! Specifically, parameter_server::run uses ProcAllocator, which spawns 14 | //! the binary defined in //monarch/examples/rdma/bootstrap.rs. 15 | //! 16 | //! If this main script was kept in the same file as parameter_server.rs, then 17 | //! spawning the actors defined in parameter_server would be named e.g. 18 | //! "parameter_server_example::ParameterServerActor", whereas the bootstrap binary 19 | //! expects this to be named "parameter_server::ParameterServerActor". 20 | //! 21 | //! Keeping this file separate allows us to avoid this naming collision. 22 | use parameter_server::run; 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), anyhow::Error> { 26 | run(4, 5).await 27 | } 28 | -------------------------------------------------------------------------------- /monarch_rdma/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
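The `black_box` calls in the bootstrap above exist only to force the linker to keep actor types whose registration happens through side effects, which is not reliable across all build modes. A minimal sketch of that force-link pattern follows; `MyActor` is a placeholder, and the turbofish type argument is an assumption since the generic arguments were lost in this listing.

// Force-link sketch: reference a type through black_box without constructing it.
use std::hint::black_box;

struct MyActor;

fn keep_linked() {
    // Defeats dead-code elimination for MyActor while creating no instance.
    let _ = black_box::<Option<MyActor>>(None);
}

fn main() {
    keep_linked();
}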
7 | */ 8 | 9 | mod ibverbs_primitives; 10 | mod rdma_buffer; 11 | mod rdma_components; 12 | mod rdma_manager_actor; 13 | 14 | pub use ibverbs_primitives::*; 15 | pub use rdma_buffer::*; 16 | pub use rdma_components::*; 17 | pub use rdma_manager_actor::*; 18 | -------------------------------------------------------------------------------- /monarch_simulator/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_simulator:[monarch_simulator,monarch_simulator_lib] 2 | 3 | [package] 4 | name = "monarch_simulator_lib" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [[bin]] 11 | name = "monarch_simulator" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | anyhow = "1.0.95" 16 | async-trait = "0.1.86" 17 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 18 | controller = { version = "0.0.0", path = "../controller" } 19 | dashmap = { version = "5.5.3", features = ["rayon", "serde"] } 20 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 21 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 22 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 23 | lazy_static = { version = "1.5", features = ["spin_no_std"], default-features = false } 24 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 25 | monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker" } 26 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 27 | ndslice = { version = "0.0.0", path = "../ndslice" } 28 | serde = { version = "1.0.185", features = ["derive", "rc"] } 29 | serde_json = { version = "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 30 | thiserror = "2.0.12" 31 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 32 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 33 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 34 | 35 | [dev-dependencies] 36 | rand = { version = "0.8", features = ["small_rng"] } 37 | tracing-test = { version = "0.2.3", features = ["no-env-filter"] } 38 | -------------------------------------------------------------------------------- /monarch_simulator/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor::actor::ActorError; 10 | use hyperactor::simnet::SimNetError; 11 | 12 | pub mod bootstrap; 13 | mod collective_coordinator; 14 | pub mod controller; 15 | pub mod simulator; 16 | pub mod worker; 17 | 18 | /// The type of error that can occur on channel operations. 19 | #[derive(thiserror::Error, Debug)] 20 | pub enum SimulatorError { 21 | /// Error during simnet operation. 22 | #[error(transparent)] 23 | SimNetError(#[from] SimNetError), 24 | 25 | /// Error during actor operations. 26 | #[error(transparent)] 27 | ActorError(#[from] ActorError), 28 | 29 | /// Simulator cannot find the world with given name. 30 | #[error("World {0} not found")] 31 | WorldNotFound(String), 32 | 33 | /// Cannot find the mesh in simulator. 
34 | #[error("Mesh not found {0}")] 35 | MeshNotFound(String), 36 | } 37 | -------------------------------------------------------------------------------- /monarch_simulator/src/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! A binary to launch the simulated Monarch controller along with necessary environment. 10 | use std::process::ExitCode; 11 | 12 | use anyhow::Result; 13 | use clap::Parser; 14 | use hyperactor::channel::ChannelAddr; 15 | use monarch_simulator_lib::bootstrap::bootstrap; 16 | 17 | #[derive(Debug, Parser)] 18 | struct Args { 19 | #[arg(short, long)] 20 | system_addr: ChannelAddr, 21 | #[arg(short, long)] 22 | proxy_addr: ChannelAddr, 23 | } 24 | 25 | const TITLE: &str = r#" 26 | ****************************************************** 27 | * * 28 | * ____ ___ __ __ _ _ _ _ _____ ___ ____ * 29 | */ ___|_ _| \/ | | | | | / \|_ _/ _ \| _ \ * 30 | *\___ \| || |\/| | | | | | / _ \ | || | | | |_) |* 31 | * ___) | || | | | |_| | |___ / ___ \| || |_| | _ < * 32 | *|____/___|_| |_|\___/|_____/_/ \_\_| \___/|_| \_\* 33 | * * 34 | ****************************************************** 35 | "#; 36 | 37 | #[tokio::main] 38 | async fn main() -> Result { 39 | eprintln!("{}", TITLE); 40 | hyperactor::initialize(); 41 | let args = Args::parse(); 42 | 43 | let system_addr = args.system_addr.clone(); 44 | let proxy_addr = args.proxy_addr.clone(); 45 | tracing::info!("starting Monarch simulation"); 46 | 47 | let operational_listener_handle = bootstrap(system_addr, proxy_addr, 1).await?; 48 | 49 | operational_listener_handle 50 | .await 51 | .expect("simulator exited unexpectedly"); 52 | 53 | Ok(ExitCode::SUCCESS) 54 | } 55 | -------------------------------------------------------------------------------- /monarch_tensor_worker/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_tensor_worker:monarch_tensor_worker 2 | 3 | [package] 4 | name = "monarch_tensor_worker" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | anyhow = "1.0.95" 12 | async-trait = "0.1.86" 13 | bincode = "1.3.3" 14 | clap = { version = "4.5.38", features = ["derive", "env", "string", "unicode", "wrap_help"] } 15 | cxx = "1.0.119" 16 | derive_more = { version = "1.0.0", features = ["full"] } 17 | futures = { version = "0.3.30", features = ["async-await", "compat"] } 18 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 19 | hyperactor_multiprocess = { version = "0.0.0", path = "../hyperactor_multiprocess" } 20 | itertools = "0.14.0" 21 | monarch_messages = { version = "0.0.0", path = "../monarch_messages" } 22 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 23 | ndslice = { version = "0.0.0", path = "../ndslice" } 24 | nix = { version = "0.29.0", features = ["dir", "event", "hostname", "inotify", "ioctl", "mman", "mount", "net", "poll", "ptrace", "reboot", "resource", "sched", "signal", "term", "time", "user", "zerocopy"] } 25 | parking_lot = { version = "0.12.1", features = ["send_guard"] } 26 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 27 | serde = { version = "1.0.185", features = ["derive", "rc"] } 28 | serde_json = { version 
= "1.0.140", features = ["float_roundtrip", "unbounded_depth"] } 29 | sorted-vec = "0.8.3" 30 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 31 | torch-sys = { version = "0.0.0", path = "../torch-sys" } 32 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 33 | tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] } 34 | 35 | [dev-dependencies] 36 | indoc = "2.0.2" 37 | rand = { version = "0.8", features = ["small_rng"] } 38 | timed_test = { version = "0.0.0", path = "../timed_test" } 39 | tokio-retry = "0.3" 40 | -------------------------------------------------------------------------------- /monarch_tensor_worker/build.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | fn main() { 10 | // `torch-sys` will set this env var through Cargo `links` metadata. 11 | let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set"); 12 | // Set the rpath so that the dynamic linker can find libtorch and friends. 13 | println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}"); 14 | } 15 | -------------------------------------------------------------------------------- /monarch_tensor_worker/src/test_util.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use std::io::IsTerminal; 10 | 11 | use anyhow::Result; 12 | use pyo3::Python; 13 | use tracing_subscriber::fmt::format::FmtSpan; 14 | 15 | pub fn test_setup() -> Result<()> { 16 | let _ = tracing_subscriber::fmt() 17 | .with_thread_ids(true) 18 | .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) 19 | .with_max_level(tracing::Level::DEBUG) 20 | .with_ansi(std::io::stderr().is_terminal()) 21 | .with_writer(std::io::stderr) 22 | .try_init(); 23 | 24 | // Redirect NCCL_DEBUG log output to a file so it doesn't clash on stdout. 25 | // TestX requires stdout to have JSON output on individual lines, and 26 | // the NCCL output is not JSON. Because it runs in a different thread, it'll 27 | // race on writing to stdout. 28 | // Do this regardless of whether NCCL_DEBUG is set or not, because it can 29 | // be set after this point in the test. If it doesn't get set, NCCL_DEBUG_FILE 30 | // will be ignored. 31 | // %h becomes hostname, %p becomes pid. 32 | let nccl_debug_file = std::env::temp_dir().join("nccl_debug.%h.%p"); 33 | tracing::debug!("Set NCCL_DEBUG_FILE to {:?}", nccl_debug_file); 34 | // Safety: Can be unsound if there are multiple threads 35 | // reading and writing the environment. 36 | unsafe { 37 | std::env::set_var("NCCL_DEBUG_FILE", nccl_debug_file); 38 | } 39 | // NOTE(agallagher): Calling `prepare_freethreaded_python` appears to 40 | // clear `PYTHONPATH` in the env, which we need for test subprocesses 41 | // to work. So, manually preserve it. 
42 | let py_path = std::env::var("PYTHONPATH"); 43 | pyo3::prepare_freethreaded_python(); 44 | if let Ok(py_path) = py_path { 45 | // SAFETY: Re-setting env var cleard by `prepare_freethreaded_python`. 46 | unsafe { std::env::set_var("PYTHONPATH", py_path) } 47 | } 48 | 49 | // We need to load torch to initialize some internal structures used by 50 | // the FFI funcs we use to convert ivalues to/from py objects. 51 | Python::with_gil(|py| py.run_bound("import torch", None, None))?; 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /monarch_tensor_worker/test_worker_main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """ 8 | Simplified version of worker_main.py for testing the monarch_tensor_worker standalone. 9 | 10 | We want a Python entrypoint here because we want to initialize the Monarch 11 | Python extension on the main thread. 12 | """ 13 | 14 | 15 | def main() -> None: 16 | # torch is import to make sure all the dynamic types are registered 17 | import torch # noqa 18 | 19 | # Force CUDA initialization early on. CUDA init is lazy, and Python CUDA 20 | # APIs are guarded to init CUDA if necessary. But our worker calls 21 | # raw libtorch APIs which are not similarly guarded. So just initialize here 22 | # to avoid issues with potentially using uninitialized CUDA state. 23 | torch.cuda.init() 24 | 25 | from monarch._rust_bindings.monarch_extension import ( # @manual=//monarch/monarch_extension:monarch_extension 26 | tensor_worker, 27 | ) 28 | 29 | # pyre-ignore[16] 30 | tensor_worker.worker_main() 31 | 32 | 33 | if __name__ == "__main__": 34 | # Do not add code here, it won't be run. Add them to the function called below. 35 | main() # pragma: no cover 36 | -------------------------------------------------------------------------------- /monarch_types/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/monarch_types:monarch_types 2 | 3 | [package] 4 | name = "monarch_types" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [dependencies] 11 | derive_more = { version = "1.0.0", features = ["full"] } 12 | hyperactor = { version = "0.0.0", path = "../hyperactor" } 13 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 14 | serde = { version = "1.0.185", features = ["derive", "rc"] } 15 | serde_bytes = "0.11" 16 | 17 | [dev-dependencies] 18 | anyhow = "1.0.95" 19 | timed_test = { version = "0.0.0", path = "../timed_test" } 20 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 21 | -------------------------------------------------------------------------------- /monarch_types/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #![feature(assert_matches)] 10 | 11 | mod pyobject; 12 | mod python; 13 | mod pytree; 14 | 15 | pub use pyobject::PickledPyObject; 16 | pub use python::SerializablePyErr; 17 | pub use python::TryIntoPyObject; 18 | pub use python::TryIntoPyObjectUnsafe; 19 | pub use pytree::PyTree; 20 | -------------------------------------------------------------------------------- /monarch_types/src/pyobject.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use hyperactor::Named; 10 | use pyo3::prelude::*; 11 | use pyo3::types::PyBytes; 12 | use serde::Deserialize; 13 | use serde::Serialize; 14 | 15 | use crate::TryIntoPyObject; 16 | 17 | #[derive(Debug, Clone, Serialize, Deserialize, Named)] 18 | pub struct PickledPyObject(#[serde(with = "serde_bytes")] Vec); 19 | 20 | impl PickledPyObject { 21 | pub fn pickle<'py>(obj: &Bound<'py, PyAny>) -> PyResult { 22 | let bytes = obj 23 | .py() 24 | .import_bound("pickle")? 25 | .call_method1("dumps", (obj,))? 26 | .downcast_into::()? 27 | .as_bytes() 28 | .to_vec(); 29 | Ok(Self(bytes)) 30 | } 31 | 32 | pub fn unpickle<'py>(&self, py: Python<'py>) -> PyResult> { 33 | py.import_bound("pickle")? 34 | .call_method1("loads", (self.0.as_slice(),)) 35 | } 36 | } 37 | 38 | impl TryFrom<&Bound<'_, PyAny>> for PickledPyObject { 39 | type Error = PyErr; 40 | fn try_from(obj: &Bound<'_, PyAny>) -> PyResult { 41 | Self::pickle(obj) 42 | } 43 | } 44 | 45 | impl TryFrom> for PickledPyObject { 46 | type Error = PyErr; 47 | fn try_from(obj: Bound<'_, PyAny>) -> PyResult { 48 | Self::pickle(&obj) 49 | } 50 | } 51 | 52 | impl FromPyObject<'_> for PickledPyObject { 53 | fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { 54 | PickledPyObject::pickle(obj) 55 | } 56 | } 57 | 58 | impl TryIntoPyObject for &PickledPyObject { 59 | fn try_to_object<'a>(self, py: Python<'a>) -> PyResult> { 60 | self.unpickle(py) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /nccl-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nccl-sys" 3 | version = "0.0.0" 4 | authors = ["Facebook"] 5 | edition = "2021" 6 | license = "MIT" 7 | links = "nccl" 8 | 9 | [dependencies] 10 | cxx = "1.0.119" 11 | serde = { version = "1.0.185", features = ["derive", "rc"] } 12 | 13 | [build-dependencies] 14 | bindgen = "0.70.1" 15 | which = "6.0.3" 16 | glob = "0.3.1" 17 | -------------------------------------------------------------------------------- /nccl-sys/src/nccl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
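`PickledPyObject` in pyobject.rs above captures a Python object as pickle bytes so it can be shipped or stored from Rust and later restored. The usage sketch below assumes the `monarch_types` crate is available as a dependency and that the interpreter is embedded via `pyo3::prepare_freethreaded_python`; it is illustrative, not a test from this repository.

// Pickle a Python dict from Rust, then restore and inspect it.
use monarch_types::PickledPyObject;
use pyo3::prelude::*;
use pyo3::types::PyDict;

fn main() -> PyResult<()> {
    pyo3::prepare_freethreaded_python();
    Python::with_gil(|py| {
        let dict = PyDict::new_bound(py);
        dict.set_item("rank", 3)?;
        let pickled = PickledPyObject::pickle(dict.as_any())?;
        let restored = pickled.unpickle(py)?;
        assert!(restored.downcast::<PyDict>().is_ok());
        Ok(())
    })
}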
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | -------------------------------------------------------------------------------- /ndslice/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/ndslice:ndslice 2 | 3 | [package] 4 | name = "ndslice" 5 | version = "0.0.0" 6 | authors = ["Facebook "] 7 | edition = "2021" 8 | description = "data structures to support n-d arrays of ranks" 9 | repository = "https://github.com/pytorch-labs/monarch/" 10 | license = "BSD-3-Clause" 11 | 12 | [dependencies] 13 | anyhow = "1.0.95" 14 | enum-as-inner = "0.6.0" 15 | itertools = "0.14.0" 16 | nom = "8" 17 | proc-macro2 = { version = "1.0.70", features = ["span-locations"] } 18 | quote = "1.0.29" 19 | rand = { version = "0.8", features = ["small_rng"] } 20 | serde = { version = "1.0.185", features = ["derive", "rc"] } 21 | thiserror = "2.0.12" 22 | 23 | [dev-dependencies] 24 | proptest = "1.5" 25 | -------------------------------------------------------------------------------- /ndslice/src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | //! Core mesh components for the hyperactor framework. 10 | //! 11 | //! Provides [`Slice`], a compact representation of a subset of a 12 | //! multidimensional array. See [`Slice`] for more details. 13 | //! 14 | //! This crate defines the foundational abstractions used in 15 | //! hyperactor's mesh layer, including multidimensional shapes and 16 | //! selection algebra. The crate avoids dependencies on procedural 17 | //! macros and other higher-level constructs, enabling reuse in both 18 | //! runtime and macro contexts. 19 | 20 | #![feature(assert_matches)] 21 | #![recursion_limit = "512"] 22 | 23 | mod slice; 24 | pub use slice::DimSliceIterator; 25 | pub use slice::Slice; 26 | pub use slice::SliceError; 27 | pub use slice::SliceIterator; 28 | 29 | /// Selection algebra for describing multidimensional mesh regions. 30 | pub mod selection; 31 | 32 | /// Core types for representing multidimensional shapes and strides. 33 | pub mod shape; 34 | 35 | /// Reshaping transformations for multidimensional slices and shapes. 36 | pub mod reshape; 37 | 38 | /// The selection expression type used to define routing constraints. 39 | pub use selection::Selection; 40 | /// DSL-style constructors for building `Selection` expressions. 41 | pub use selection::dsl; 42 | /// Represents an interval with an optional end and step, used to 43 | /// define extents in `Shape` and coordinate filters in `Selection`. 44 | pub use shape::Range; 45 | /// Describes the size and layout of a multidimensional mesh. 46 | pub use shape::Shape; 47 | /// Errors that can occur during shape construction or validation. 48 | pub use shape::ShapeError; 49 | 50 | /// Property-based generators for randomized test input. 
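The ndslice crate docs above describe `Slice` as a compact representation of a subset of a multidimensional array in terms of offset, sizes, and strides. The sketch below shows the row-major coordinate-to-flat-index arithmetic such a representation implies; it is a self-contained illustration of the idea, not the actual `Slice` API.

// Coordinate -> flat index arithmetic for an offset/strides description.
fn flat_index(offset: usize, strides: &[usize], coords: &[usize]) -> usize {
    offset
        + coords
            .iter()
            .zip(strides.iter())
            .map(|(c, s)| c * s)
            .sum::<usize>()
}

fn main() {
    // A 2 x 3 region laid out row-major inside a larger buffer starting at 10.
    let strides = [3, 1];
    assert_eq!(flat_index(10, &strides, &[0, 0]), 10);
    assert_eq!(flat_index(10, &strides, &[1, 2]), 15);
}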
51 | #[cfg(test)] 52 | pub mod strategy; 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | 3 | markers = [ 4 | "oss_skip: marks tests to skip in OSS CI", 5 | ] 6 | asyncio_mode = "auto" 7 | -------------------------------------------------------------------------------- /python/monarch/_monarch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /python/monarch/_monarch/hyperactor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import abc 9 | 10 | from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage 11 | 12 | from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 13 | LocalAllocatorBase, 14 | ) 15 | 16 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox, PortId 17 | 18 | from monarch._rust_bindings.monarch_hyperactor.proc import ( # @manual=//monarch/monarch_extension:monarch_extension 19 | ActorId, 20 | Alloc, 21 | AllocConstraints, 22 | AllocSpec, 23 | init_proc, 24 | Proc, 25 | Serialized, 26 | ) 27 | 28 | from monarch._rust_bindings.monarch_hyperactor.shape import ( # @manual=//monarch/monarch_extension:monarch_extension 29 | Shape, 30 | ) 31 | 32 | 33 | class Actor(abc.ABC): 34 | @abc.abstractmethod 35 | async def handle(self, mailbox: Mailbox, message: PythonMessage) -> None: ... 36 | 37 | async def handle_cast( 38 | self, 39 | mailbox: Mailbox, 40 | rank: int, 41 | coordinates: list[tuple[str, int]], 42 | message: PythonMessage, 43 | ) -> None: 44 | await self.handle(mailbox, message) 45 | 46 | 47 | __all__ = [ 48 | "init_proc", 49 | "Actor", 50 | "ActorId", 51 | "ActorHandle", 52 | "Alloc", 53 | "AllocSpec", 54 | "PortId", 55 | "Proc", 56 | "Serialized", 57 | "PickledMessage", 58 | "PickledMessageClientActor", 59 | "PythonMessage", 60 | "Mailbox", 61 | "PortHandle", 62 | "PortReceiver", 63 | "OncePortHandle", 64 | "OncePortReceiver", 65 | "Alloc", 66 | "AllocSpec", 67 | "AllocConstraints", 68 | "ProcMesh", 69 | "PythonActorMesh", 70 | "ProcessAllocatorBase", 71 | "Shape", 72 | "Selection", 73 | "LocalAllocatorBase", 74 | ] 75 | -------------------------------------------------------------------------------- /python/monarch/_monarch/selection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from monarch._rust_bindings.monarch_hyperactor.selection import ( # @manual=//monarch/monarch_extension:monarch_extension 8 | Selection, 9 | ) 10 | 11 | __all__ = [ 12 | "Selection", 13 | ] 14 | -------------------------------------------------------------------------------- /python/monarch/_monarch/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_monarch/worker/__init__.py -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_rust_bindings/__init__.pyi -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/hyperactor_extension/alloc.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Dict, final, Optional 8 | 9 | class Alloc: 10 | """ 11 | An alloc represents an allocation of procs. Allocs are returned by 12 | one of the allocator implementations, such as `ProcessAllocator` or 13 | `LocalAllocator`. 14 | """ 15 | 16 | @final 17 | class AllocConstraints: 18 | def __init__(self, match_labels: Optional[Dict[str, str]] = None) -> None: 19 | """ 20 | Create a new alloc constraints. 21 | 22 | Arguments: 23 | - `match_labels`: A dictionary of labels to match. If a label is present 24 | in the dictionary, the alloc must have that label and its value 25 | must match the value in the dictionary. 26 | """ 27 | ... 28 | 29 | @final 30 | class AllocSpec: 31 | def __init__(self, constraints: AllocConstraints, **kwargs: int) -> None: 32 | """ 33 | Initialize a shape with the provided dimension-size pairs. 34 | For example, `AllocSpec(constraints, replica=2, host=3, gpu=8)` creates a 35 | shape with 2 replicas with 3 hosts each, each of which in turn 36 | has 8 GPUs. 37 | """ 38 | ... 39 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/hyperactor_extension/telemetry.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def forward_to_tracing(message: str, file: str, lineno: int, level: int) -> None: 8 | """ 9 | Log a message with the given metadata. 10 | 11 | Args: 12 | - message (str): The log message. 13 | - file (str): The file where the log message originated. 14 | - lineno (int): The line number where the log message originated. 15 | - level (int): The log level (10 for debug, 20 for info, 30 for warn, 40 for error). 16 | """ 17 | ... 
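# A minimal illustrative sketch (not part of the stub above): one way to route
# standard-library logging records into this hook. Only the `forward_to_tracing`
# signature is taken from this file; the `TracingHandler` class and the logger
# name are hypothetical.
#
#     import logging
#     from monarch._rust_bindings.hyperactor_extension.telemetry import forward_to_tracing
#
#     class TracingHandler(logging.Handler):
#         def emit(self, record: logging.LogRecord) -> None:
#             # Map stdlib record fields onto (message, file, lineno, level).
#             forward_to_tracing(
#                 record.getMessage(), record.pathname, record.lineno, record.levelno
#             )
#
#     logging.getLogger("monarch").addHandler(TracingHandler())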
18 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/_rust_bindings/monarch_extension/__init__.pyi -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/debugger.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Optional, Union 8 | 9 | from monarch._rust_bindings.monarch_hyperactor.proc import Serialized 10 | from monarch._rust_bindings.monarch_messages.debugger import ( 11 | DebuggerAction, 12 | DebuggerActionType, 13 | ) 14 | 15 | @final 16 | class DebuggerMessage: 17 | """A message for debugger communication between worker and client.""" 18 | 19 | def __init__(self, action: DebuggerActionType) -> None: 20 | """ 21 | Create a new DebuggerMessage. 22 | 23 | Arguments: 24 | action: The debugger action to include in the message. 25 | """ 26 | ... 27 | 28 | @property 29 | def action(self) -> DebuggerActionType: 30 | """Get the debugger action contained in this message.""" 31 | ... 32 | 33 | def serialize(self) -> Serialized: 34 | """ 35 | Serialize this message for transmission. 36 | 37 | Returns: 38 | A serialized representation of this message. 39 | """ 40 | ... 41 | 42 | @final 43 | class PdbActor: 44 | """An actor for interacting with PDB debugging sessions.""" 45 | 46 | def __init__(self) -> None: 47 | """Create a new PdbActor.""" 48 | ... 49 | 50 | def send(self, action: DebuggerActionType) -> None: 51 | """ 52 | Send a debugger action to the worker. 53 | 54 | Arguments: 55 | action: The debugger action to send. 56 | """ 57 | ... 58 | 59 | def receive(self) -> Optional[DebuggerActionType]: 60 | """ 61 | Receive a debugger action from the worker. 62 | 63 | Returns: 64 | A DebuggerAction if one is available, or None if no action is available. 65 | """ 66 | ... 67 | 68 | def drain_and_stop(self) -> None: 69 | """ 70 | Drain any remaining messages and stop the actor. 71 | """ 72 | ... 73 | 74 | def get_bytes_from_write_action(action: DebuggerAction.Write) -> bytes: 75 | """ 76 | Extract the bytes from the provided write action. 77 | """ 78 | ... 79 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/panic.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def panicking_function() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_extension/simulator_client.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | @final 10 | class SimulatorClient: 11 | """ 12 | A wrapper around [simulator_client::Simulatorclient] to expose it to python. 13 | It is a client to communicate with the simulator service. 14 | 15 | Arguments: 16 | - `proxy_addr`: Address of the simulator's proxy server. 17 | """ 18 | 19 | def __init__(self, proxy_addr: str) -> None: ... 20 | def kill_world(self, world_name: str) -> None: 21 | """ 22 | Kill the world with the given name. 23 | 24 | Arguments: 25 | - `world_name`: Name of the world to kill. 26 | """ 27 | ... 28 | def spawn_mesh( 29 | self, system_addr: str, controller_actor_id: str, worker_world: str 30 | ) -> None: 31 | """ 32 | Spawn a mesh actor. 33 | 34 | Arguments: 35 | - `system_addr`: Address of the system to spawn the mesh in. 36 | - `controller_actor_id`: Actor id of the controller to spawn the mesh in. 37 | - `worker_world`: World of the worker to spawn the mesh in. 38 | """ 39 | ... 40 | 41 | def set_training_script_state_running(self) -> None: 42 | """ 43 | Let the simulator know that the training script is actively sending 44 | commands to the backend 45 | """ 46 | ... 47 | 48 | def set_training_script_state_waiting(self) -> None: 49 | """ 50 | Let the simulator know that the training script is waiting for the 51 | backend to resolve a future 52 | """ 53 | ... 54 | 55 | def bootstrap_simulator_backend( 56 | system_addr: str, proxy_addr: str, world_size: int 57 | ) -> None: 58 | """ 59 | Bootstrap the simulator backend on the current process 60 | """ 61 | ... 62 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/actor_mesh.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import final 10 | 11 | from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage 12 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox 13 | 14 | from monarch._rust_bindings.monarch_hyperactor.proc import ActorId 15 | 16 | from monarch._rust_bindings.monarch_hyperactor.shape import Shape 17 | 18 | @final 19 | class PythonActorMesh: 20 | def cast(self, message: PythonMessage) -> None: 21 | """ 22 | Cast a message to this mesh. 23 | """ 24 | 25 | def get(self, rank: int) -> ActorId | None: 26 | """ 27 | Get the actor id for the actor at the given rank. 28 | """ 29 | ... 30 | 31 | @property 32 | def client(self) -> Mailbox: 33 | """ 34 | A client that can be used to communicate with individual 35 | actors in the mesh, and also to create ports that can be 36 | broadcast across the mesh) 37 | """ 38 | ... 39 | 40 | @property 41 | def shape(self) -> Shape: 42 | """ 43 | The Shape object that describes how the rank of an actor 44 | retrieved with get corresponds to coordinates in the 45 | mesh. 46 | """ 47 | ... 48 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/alloc.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import Optional 10 | 11 | from monarch._rust_bindings.hyperactor_extension.alloc import Alloc, AllocSpec 12 | 13 | class ProcessAllocatorBase: 14 | def __init__( 15 | self, 16 | program: str, 17 | args: Optional[list[str]] = None, 18 | envs: Optional[dict[str, str]] = None, 19 | ) -> None: 20 | """ 21 | Create a new process allocator. 22 | 23 | Arguments: 24 | - `program`: The program for each process to run. Must be a hyperactor 25 | bootstrapped program. 26 | - `args`: The arguments to pass to the program. 27 | - `envs`: The environment variables to set for the program. 28 | """ 29 | ... 30 | 31 | async def allocate_nonblocking(self, spec: AllocSpec) -> Alloc: 32 | """ 33 | Allocate a process according to the provided spec. 34 | 35 | Arguments: 36 | - `spec`: The spec to allocate according to. 37 | """ 38 | ... 39 | 40 | def allocate_blocking(self, spec: AllocSpec) -> Alloc: 41 | """ 42 | Allocate a process according to the provided spec, blocking until an 43 | alloc is returned. 44 | 45 | Arguments: 46 | - `spec`: The spec to allocate according to. 47 | """ 48 | ... 49 | 50 | class LocalAllocatorBase: 51 | async def allocate_nonblocking(self, spec: AllocSpec) -> Alloc: 52 | """ 53 | Allocate a process according to the provided spec. 54 | 55 | Arguments: 56 | - `spec`: The spec to allocate according to. 57 | """ 58 | ... 59 | 60 | def allocate_blocking(self, spec: AllocSpec) -> Alloc: 61 | """ 62 | Allocate a process according to the provided spec, blocking until an 63 | alloc is returned. 64 | 65 | Arguments: 66 | - `spec`: The spec to allocate according to. 67 | """ 68 | ... 69 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/bootstrap.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | def bootstrap_main() -> None: ... 8 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/proc_mesh.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from typing import final, Type 10 | 11 | from monarch._rust_bindings.hyperactor_extension.alloc import Alloc 12 | from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh 13 | from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox 14 | from monarch._rust_bindings.monarch_hyperactor.proc import Actor 15 | from monarch._rust_bindings.monarch_hyperactor.shape import Shape 16 | 17 | @final 18 | class ProcMesh: 19 | @classmethod 20 | async def allocate_nonblocking(self, alloc: Alloc) -> ProcMesh: 21 | """ 22 | Allocate a process mesh according to the provided alloc. 23 | Returns when the mesh is fully allocated. 24 | 25 | Arguments: 26 | - `alloc`: The alloc to allocate according to. 27 | """ 28 | ... 
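    # Illustrative sketch (not part of this stub): a typical non-blocking flow,
    # assuming an allocator exposing `allocate_nonblocking` (see alloc.pyi earlier
    # in this listing) and a user-defined actor class. `allocator`, `spec`, and
    # `MyActor` are hypothetical names.
    #
    #     alloc = await allocator.allocate_nonblocking(spec)
    #     mesh = await ProcMesh.allocate_nonblocking(alloc)
    #     actors = await mesh.spawn_nonblocking("my_actor", MyActor)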
29 | 30 | @classmethod 31 | def allocate_blocking(self, alloc: Alloc) -> ProcMesh: 32 | """ 33 | Allocate a process mesh according to the provided alloc. 34 | Blocks until the mesh is fully allocated. 35 | 36 | Arguments: 37 | - `alloc`: The alloc to allocate according to. 38 | """ 39 | ... 40 | 41 | async def spawn_nonblocking(self, name: str, actor: Type[Actor]) -> PythonActorMesh: 42 | """ 43 | Spawn a new actor on this mesh. 44 | 45 | Arguments: 46 | - `name`: Name of the actor. 47 | - `actor`: The type of the actor that will be spawned. 48 | """ 49 | ... 50 | 51 | async def spawn_blocking(self, name: str, actor: Type[Actor]) -> PythonActorMesh: 52 | """ 53 | Spawn a new actor on this mesh. Blocks until the actor is fully spawned. 54 | 55 | Arguments: 56 | - `name`: Name of the actor. 57 | - `actor`: The type of the actor that will be spawned. 58 | """ 59 | ... 60 | 61 | @property 62 | def client(self) -> Mailbox: 63 | """ 64 | A client that can be used to communicate with individual 65 | actors in the mesh, and also to create ports that can be 66 | broadcast across the mesh) 67 | """ 68 | ... 69 | 70 | @property 71 | def shape(self) -> Shape: 72 | """ 73 | The shape of the mesh. 74 | """ 75 | ... 76 | 77 | def __repr__(self) -> str: ... 78 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/runtime.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | """ 10 | Type hints for the runtime module. 11 | """ 12 | 13 | def sleep_indefinitely_for_unit_tests() -> None: 14 | """ 15 | A test function that sleeps indefinitely in a loop. 16 | This is used for testing signal handling in signal_safe_block_on. 17 | The function will sleep forever until interrupted by a signal. 18 | 19 | Raises: 20 | KeyboardInterrupt: When interrupted by a signal like SIGINT 21 | """ 22 | ... 23 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_hyperactor/selection.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | @final 10 | class Selection: 11 | """Opaque representation of a selection expression used to represent 12 | constraints over multidimensional shapes. 13 | 14 | Construct via from_string()` and use with mesh APIs to filter, 15 | evaluate, or route over structured topologies. 16 | """ 17 | def __repr__(self) -> str: ... 18 | @classmethod 19 | def from_string(cls, s: str) -> Selection: 20 | """Parse a selection expression from a string. 21 | 22 | Accepts a compact string syntax such as `"(*, 0:4)"` or `"0 & (1 | 2)"`, 23 | and returns a structured Selection object. 24 | 25 | Raises: 26 | ValueError: if the input string is not a valid selection expression. 27 | """ 28 | ... 
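# Illustrative sketch (not part of the stub): round-tripping the compact syntax
# quoted in the from_string docstring. How the resulting object is consumed is
# up to the mesh APIs that accept a Selection.
#
#     from monarch._rust_bindings.monarch_hyperactor.selection import Selection
#
#     sel = Selection.from_string("(*, 0:4)")
#     print(repr(sel))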
29 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_messages/debugger.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Union 8 | 9 | @final 10 | class DebuggerAction: 11 | """Enum representing actions for the debugger communication between worker and client.""" 12 | 13 | class Paused: 14 | """ 15 | Sent from worker to client to indicate that the worker has entered 16 | a pdb debugging session. 17 | """ 18 | 19 | pass 20 | 21 | class Attach: 22 | """ 23 | Sent from client to worker to indicate that the client has started 24 | the debugging session. 25 | """ 26 | 27 | pass 28 | 29 | class Detach: 30 | """Sent to client or to worker to end the debugging session.""" 31 | 32 | pass 33 | 34 | class Write: 35 | """Sent to client or to worker to write bytes to receiver's stdout.""" 36 | 37 | def __init__(self, bytes: bytes) -> None: ... 38 | 39 | class Read: 40 | """Sent from worker to client to read bytes from client's stdin.""" 41 | 42 | def __init__(self, requested_size: int) -> None: ... 43 | @property 44 | def requested_size(self) -> int: 45 | """Get the number of bytes to read from stdin.""" 46 | ... 47 | 48 | DebuggerActionType = Union[ 49 | DebuggerAction.Paused, 50 | DebuggerAction.Attach, 51 | DebuggerAction.Detach, 52 | DebuggerAction.Read, 53 | DebuggerAction.Write, 54 | ] 55 | -------------------------------------------------------------------------------- /python/monarch/_rust_bindings/monarch_tensor_worker/bootstrap.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final, Optional, Tuple 8 | 9 | class WorkerServerRequest: 10 | """ 11 | Python binding for the Rust WorkerServerRequest enum. 12 | """ 13 | 14 | @final 15 | class Run(WorkerServerRequest): 16 | """ 17 | Create a Run request variant. 18 | 19 | Args: 20 | world_id: The ID of the world 21 | proc_id: The ID of the process 22 | bootstrap_addr: The bootstrap address 23 | 24 | Returns: 25 | A WorkerServerRequest.Run instance 26 | """ 27 | def __init__( 28 | self, 29 | *, 30 | world_id: str, 31 | proc_id: str, 32 | bootstrap_addr: str, 33 | labels: list[Tuple[str, str]], 34 | ) -> None: ... 35 | 36 | @final 37 | class Exit(WorkerServerRequest): 38 | """ 39 | Create an Exit request variant. 40 | 41 | Returns: 42 | A WorkerServerRequest.Exit instance 43 | """ 44 | 45 | pass 46 | 47 | def to_json(self) -> str: 48 | """ 49 | Convert this request to a JSON string. 50 | 51 | Returns: 52 | A JSON string representation of this request 53 | 54 | Raises: 55 | Exception: If serialization fails 56 | """ 57 | pass 58 | 59 | class WorkerServerResponse: 60 | """ 61 | Python binding for the Rust WorkerServerResponse enum. 62 | """ 63 | 64 | @final 65 | class Finished(WorkerServerResponse): 66 | """ 67 | Create a Finished response variant. 
68 | 69 | Args: 70 | error: An optional error message if the operation failed 71 | 72 | Returns: 73 | A WorkerServerResponse.Finished instance 74 | """ 75 | 76 | error: Optional[str] 77 | 78 | @classmethod 79 | def from_json(cls, json: str) -> "WorkerServerResponse": 80 | """ 81 | Create a WorkerServerResponse from a JSON string. 82 | 83 | Args: 84 | json: A JSON string representation of a WorkerServerResponse 85 | 86 | Returns: 87 | The deserialized WorkerServerResponse 88 | 89 | Raises: 90 | Exception: If deserialization fails 91 | """ 92 | pass 93 | -------------------------------------------------------------------------------- /python/monarch/allocator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import final 8 | 9 | from monarch import ActorFuture as Future 10 | from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 11 | Alloc, 12 | AllocSpec, 13 | ) 14 | 15 | from monarch._rust_bindings.monarch_hyperactor.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 16 | LocalAllocatorBase, 17 | ProcessAllocatorBase, 18 | ) 19 | 20 | 21 | @final 22 | class ProcessAllocator(ProcessAllocatorBase): 23 | """ 24 | An allocator that allocates by spawning local processes. 25 | """ 26 | 27 | def allocate(self, spec: AllocSpec) -> Future[Alloc]: 28 | """ 29 | Allocate a process according to the provided spec. 30 | 31 | Arguments: 32 | - `spec`: The spec to allocate according to. 33 | 34 | Returns: 35 | - A future that will be fulfilled when the requested allocation is fulfilled. 36 | """ 37 | return Future( 38 | lambda: self.allocate_nonblocking(spec), 39 | lambda: self.allocate_blocking(spec), 40 | ) 41 | 42 | 43 | @final 44 | class LocalAllocator(LocalAllocatorBase): 45 | """ 46 | An allocator that allocates by spawning actors into the current process. 47 | """ 48 | 49 | def allocate(self, spec: AllocSpec) -> Future[Alloc]: 50 | """ 51 | Allocate a process according to the provided spec. 52 | 53 | Arguments: 54 | - `spec`: The spec to allocate according to. 55 | 56 | Returns: 57 | - A future that will be fulfilled when the requested allocation is fulfilled. 58 | """ 59 | return Future( 60 | lambda: self.allocate_nonblocking(spec), 61 | lambda: self.allocate_blocking(spec), 62 | ) 63 | -------------------------------------------------------------------------------- /python/monarch/builtins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | """ 9 | Builtins for Monarch is a set of remote function defintions for PyTorch functions and other utilities. 10 | """ 11 | 12 | from .log import log_remote, set_logging_level_remote 13 | 14 | __all__ = ["log_remote", "set_logging_level_remote"] 15 | -------------------------------------------------------------------------------- /python/monarch/builtins/log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | 9 | from monarch.common.remote import remote 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @remote(propagate="inspect") 16 | def log_remote(*args, level: int = logging.WARNING, **kwargs) -> None: 17 | logger.log(level, *args, **kwargs) 18 | 19 | 20 | @remote(propagate="inspect") 21 | def set_logging_level_remote(level: int) -> None: 22 | logger.setLevel(level) 23 | -------------------------------------------------------------------------------- /python/monarch/builtins/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre strict 8 | from typing import Callable 9 | 10 | import torch 11 | from monarch.common.remote import remote 12 | 13 | 14 | @remote(propagate="inspect") 15 | def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None: 16 | torch.manual_seed(seed ^ process_idx) 17 | 18 | 19 | @remote(propagate=lambda: 0) 20 | def initial_seed_remote() -> int: 21 | return torch.initial_seed() 22 | 23 | 24 | @remote(propagate=lambda: torch.zeros(1)) 25 | def get_rng_state_remote() -> torch.Tensor: 26 | return torch.get_rng_state() 27 | 28 | 29 | @remote(propagate="inspect") 30 | def set_rng_state_remote(new_state: torch.Tensor) -> None: 31 | torch.set_rng_state(new_state) 32 | 33 | 34 | def _run_no_return(f: Callable) -> None: 35 | f() 36 | return None 37 | 38 | 39 | # TODO: return result when uint64 is supported from remote function 40 | @remote(propagate=lambda: _run_no_return(torch.seed)) 41 | def seed_remote() -> None: 42 | torch.seed() 43 | 44 | 45 | # same underlying implementation as seed_remote (torch.seed) 46 | # TODO: return result when uint64 is supported from remote function 47 | @remote(propagate=lambda: _run_no_return(torch.random.seed)) 48 | def random_seed_remote() -> None: 49 | torch.random.seed() 50 | 51 | 52 | @remote(propagate="inspect") 53 | def manual_seed_cuda_remote(seed: int) -> None: 54 | torch.cuda.manual_seed(seed) 55 | 56 | 57 | @remote(propagate="inspect") 58 | def manual_seed_all_cuda_remote(seed: int) -> None: 59 | torch.cuda.manual_seed_all(seed) 60 | 61 | 62 | @remote(propagate=lambda: [torch.zeros(1)]) 63 | def get_rng_state_all_cuda_remote() -> list[torch.Tensor]: 64 | return torch.cuda.get_rng_state_all() 65 | 66 | 67 | @remote(propagate="inspect") 68 | def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None: 69 | torch.cuda.set_rng_state_all(states) 70 | -------------------------------------------------------------------------------- /python/monarch/common/_C.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | def patch_cuda() -> None: ... 10 | def mock_cuda() -> None: ... 11 | def unmock_cuda() -> None: ... 
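# Illustrative note (not part of the stub): these bindings are wrapped by
# `monarch.common.mock_cuda` later in this listing, which calls `patch_cuda()`
# once at import time and toggles mocking around a dedicated CUDA stream. A
# hedged usage sketch (`run_under_mocked_cuda` is a hypothetical user function):
#
#     from monarch.common.mock_cuda import mock_cuda_guard
#
#     with mock_cuda_guard():  # mock_cuda() on entry, unmock_cuda() on exit
#         run_under_mocked_cuda()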
12 | -------------------------------------------------------------------------------- /python/monarch/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch/common/__init__.py -------------------------------------------------------------------------------- /python/monarch/common/_device_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import re 9 | from pathlib import Path 10 | 11 | 12 | def _local_device_count(): 13 | if "CUDA_VISIBLE_DEVICES" in os.environ: 14 | return len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) 15 | dev_path = Path("/dev") 16 | pattern = re.compile(r"nvidia\d+$") 17 | nvidia_devices = [dev for dev in dev_path.iterdir() if pattern.match(dev.name)] 18 | return len(nvidia_devices) 19 | -------------------------------------------------------------------------------- /python/monarch/common/base_tensor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import torch 9 | 10 | 11 | # All of the tensor examples in this zoo inherit from BaseTensor. Ideally, 12 | # however, they would inherit directly from Tensor. This is just our staging 13 | # ground for applying behavior that hasn't yet made it into core but that 14 | # we would like to apply by default. 15 | class BaseTensor(torch.Tensor): 16 | # See https://github.com/pytorch/pytorch/pull/73727 ; this is necessary 17 | # to ensure that super().__new__ can cooperate with each other 18 | @staticmethod 19 | def __new__(cls, elem, *, requires_grad=None): 20 | if requires_grad is None: 21 | return super().__new__(cls, elem) 22 | else: 23 | return cls._make_subclass(cls, elem, requires_grad) 24 | 25 | # If __torch_dispatch__ is defined (which it will be for all our examples) 26 | # the default torch function implementation (which preserves subclasses) 27 | # typically must be disabled 28 | __torch_function__ = torch._C._disabled_torch_function_impl 29 | -------------------------------------------------------------------------------- /python/monarch/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | SIM_MESH_CLIENT_TIMEOUT = 5 10 | SIM_MESH_CLIENT_SUPERVISION_UPDATE_INTERVAL = 5 11 | -------------------------------------------------------------------------------- /python/monarch/common/context_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from functools import wraps 9 | 10 | 11 | class _ContextManager: 12 | def __init__(self, generator): 13 | self.generator = generator 14 | self.generator.send(None) 15 | 16 | def __enter__(self): 17 | return 18 | 19 | def __exit__(self, *args): 20 | try: 21 | self.generator.send(None) 22 | except StopIteration: 23 | pass 24 | else: 25 | raise RuntimeError("context manager generator did not exit") 26 | 27 | 28 | def activate_first_context_manager(func): 29 | """ 30 | Similar to contextlib.contextmanager but it 31 | starts the context when the function is called rather than 32 | than at the start of the with statement. Useful for things where 33 | you want to optionally activate the context without a guard. 34 | """ 35 | 36 | @wraps(func) 37 | def helper(*args, **kwargs): 38 | return _ContextManager(func(*args, **kwargs)) 39 | 40 | return helper 41 | -------------------------------------------------------------------------------- /python/monarch/common/fake.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from concurrent.futures import ThreadPoolExecutor 9 | from functools import cache 10 | 11 | from torch._subclasses.fake_tensor import FakeTensorMode 12 | 13 | 14 | @cache 15 | def _fake_mode_worker(): 16 | return ThreadPoolExecutor(max_workers=1) 17 | 18 | 19 | @cache 20 | def _fake_mode(): 21 | return FakeTensorMode() 22 | 23 | 24 | def fake_call(fn, *args, **kwargs): 25 | """Execute on work on a ThreadPool worker 26 | 27 | First call (ThreadPoolExecutor init) will take the GIL and may block for long time! 28 | TODO: this will be replaced with something more performant 29 | """ 30 | global _fake_mode_worker, fake_mode 31 | 32 | # # Calls FakeTensorMode while re-enabling version counter tracking 33 | # # todo(chilli): I'm not totally sure why I need to disable python dispatch 34 | # # key. Perhaps there's some unwrapping that should have happened further up. 35 | # include_to_set = torch._C._dispatch_tls_local_include_set() 36 | # exclude_to_set = ( 37 | # torch._C._dispatch_tls_local_exclude_set() 38 | # | torch._C.DispatchKeySet(torch._C.DispatchKey.Python) 39 | # ) - torch._C.DispatchKeySet(torch._C.DispatchKey.ADInplaceOrView) 40 | 41 | # def work(): 42 | # with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set): 43 | # with fake_mode: 44 | # return fn(*args, **kwargs) 45 | 46 | # return work() 47 | 48 | def work(): 49 | # fake mode must be initialized in the worker thread 50 | # otherwise a monarch dispatch mode may be active, causing 51 | # FakeTensorMode to initialize wrong. 52 | with _fake_mode(): 53 | return fn(*args, **kwargs) 54 | 55 | return _fake_mode_worker().submit(work).result() 56 | -------------------------------------------------------------------------------- /python/monarch/common/init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include <Python.h> 10 | // @lint-ignore CLANGTIDY facebook-hte-RelativeInclude 11 | #include "mock_cuda.h" 12 | 13 | static PyMethodDef _C_methods[] = { 14 | {"patch_cuda", 15 | patch_cuda, 16 | METH_NOARGS, 17 | "Initialize the monarch cuda patch."}, 18 | {"mock_cuda", mock_cuda, METH_NOARGS, "Enable cuda mocking."}, 19 | {"unmock_cuda", unmock_cuda, METH_NOARGS, "Disable cuda mocking."}, 20 | {NULL, NULL, 0, NULL}}; 21 | 22 | static struct PyModuleDef _C_module = { 23 | PyModuleDef_HEAD_INIT, 24 | "_C", 25 | "A module containing monarch C++ functionality.", 26 | -1, 27 | _C_methods, 28 | NULL, 29 | NULL, 30 | NULL, 31 | NULL}; 32 | 33 | PyMODINIT_FUNC PyInit__C(void) { 34 | return PyModule_Create(&_C_module); 35 | } 36 | -------------------------------------------------------------------------------- /python/monarch/common/mock_cuda.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include <Python.h> 12 | 13 | PyObject* patch_cuda(PyObject*, PyObject*); 14 | PyObject* mock_cuda(PyObject*, PyObject*); 15 | PyObject* unmock_cuda(PyObject*, PyObject*); 16 | -------------------------------------------------------------------------------- /python/monarch/common/mock_cuda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | from contextlib import contextmanager 9 | from typing import Generator, Optional 10 | 11 | import monarch.common._C # @manual=//monarch/python/monarch/common:_C 12 | import torch 13 | 14 | monarch.common._C.patch_cuda() 15 | 16 | _mock_cuda_stream: Optional[torch.cuda.Stream] = None 17 | 18 | 19 | def get_mock_cuda_stream() -> torch.cuda.Stream: 20 | global _mock_cuda_stream 21 | if _mock_cuda_stream is None: 22 | _mock_cuda_stream = torch.cuda.Stream() 23 | return _mock_cuda_stream 24 | 25 | 26 | @contextmanager 27 | def mock_cuda_guard() -> Generator[None, None, None]: 28 | try: 29 | with torch.cuda.stream(get_mock_cuda_stream()): 30 | monarch.common._C.mock_cuda() 31 | yield 32 | finally: 33 | monarch.common._C.unmock_cuda() 34 | 35 | 36 | def mock_cuda() -> None: 37 | monarch.common._C.mock_cuda() 38 | 39 | 40 | def unmock_cuda() -> None: 41 | monarch.common._C.unmock_cuda() 42 | -------------------------------------------------------------------------------- /python/monarch/common/pickle_flatten.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | import io 8 | import pickle 9 | from typing import Any, Callable, Iterable, List, Tuple 10 | 11 | import cloudpickle 12 | 13 | 14 | class _Pickler(cloudpickle.Pickler): 15 | def __init__(self, filter): 16 | self.f = io.BytesIO() 17 | super().__init__(self.f) 18 | self._filter = filter 19 | self._saved = [] 20 | 21 | def persistent_id(self, obj): 22 | if not self._filter(obj): 23 | return None 24 | self._saved.append(obj) 25 | return len(self._saved) - 1 26 | 27 | 28 | class _Unpickler(pickle.Unpickler): 29 | def __init__(self, data, sequence: Iterable[Any]): 30 | super().__init__(io.BytesIO(data)) 31 | self._iter = iter(sequence) 32 | self._values = [] 33 | 34 | def persistent_load(self, id): 35 | while id >= len(self._values): 36 | self._values.append(next(self._iter)) 37 | return self._values[id] 38 | 39 | 40 | def flatten(obj: Any, filter: Callable[[Any], bool]) -> Tuple[List[Any], bytes]: 41 | pickler = _Pickler(filter) 42 | pickler.dump(obj) 43 | return pickler._saved, pickler.f.getvalue() 44 | 45 | 46 | def unflatten(data: bytes, values: Iterable[Any]) -> Any: 47 | up = _Unpickler(data, values) 48 | return up.load() 49 | -------------------------------------------------------------------------------- /python/monarch/common/process_group.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | 9 | import logging 10 | 11 | import torch.distributed as dist 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _wrap_method(process_group: dist.ProcessGroup, method): 17 | def wrapper(*args, **kwargs): 18 | logger.debug( 19 | "ProcessGroup Call: %s with args %s and kwargs %s", method, args, kwargs 20 | ) 21 | fn = getattr(process_group, method) 22 | try: 23 | return fn(*args, **kwargs) 24 | except Exception as e: 25 | logger.warning( 26 | "ProcessGroup Call: %s with args %s and kwargs %s failed with exception: %s", 27 | method, 28 | args, 29 | kwargs, 30 | str(e), 31 | ) 32 | # TODO(rajeshn): send a message back to the controller that this 33 | # worker had a failed communication event 34 | raise e 35 | 36 | return wrapper 37 | 38 | 39 | class SingleControllerProcessGroupWrapper: 40 | """ 41 | Wraps a ProcessGroup object to provide a single controller process group. This provides us a hook to observe 42 | all the operatons on the process group to the controller. 43 | """ 44 | 45 | def __new__(cls, pg: dist.ProcessGroup): 46 | instance = super().__new__(cls) 47 | 48 | for attr in dir(type(pg)): 49 | if not attr.startswith("__") and callable(getattr(type(pg), attr)): 50 | setattr(instance, attr, _wrap_method(pg, attr)) 51 | 52 | return instance 53 | 54 | def __init__(self, process_group): 55 | self.process_group = process_group 56 | -------------------------------------------------------------------------------- /python/monarch/common/reference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | from typing import Optional 9 | 10 | from monarch._rust_bindings.monarch_extension.tensor_worker import Ref 11 | 12 | 13 | class Referenceable: 14 | def __init__(self): 15 | self.ref: Optional[int] = None 16 | 17 | def delete_ref(self, ref): 18 | raise NotImplementedError("no delete_ref method") 19 | 20 | def __reduce_ex__(self, protocol): 21 | assert ( 22 | self.ref is not None 23 | ), f"{self} is being sent but does not have a reference" 24 | return Ref, (self.ref,) 25 | 26 | # Used by rust backend to get the ref for this object 27 | def __monarch_ref__(self) -> int: 28 | assert self.ref is not None 29 | return self.ref 30 | 31 | def __del__(self): 32 | if self.ref is not None: 33 | self.delete_ref(self.ref) 34 | -------------------------------------------------------------------------------- /python/monarch/common/selection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from monarch._rust_bindings.monarch_hyperactor.selection import Selection 8 | 9 | __all__ = ["Selection"] 10 | -------------------------------------------------------------------------------- /python/monarch/common/tensor_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import NamedTuple, Tuple 9 | 10 | import torch 11 | 12 | 13 | class TensorFactory(NamedTuple): 14 | size: Tuple[int, ...] 15 | dtype: torch.dtype 16 | layout: torch.layout 17 | device: torch.device 18 | 19 | @staticmethod 20 | def from_tensor(t): 21 | return TensorFactory(t.size(), t.dtype, t.layout, t.device) 22 | 23 | def empty(self): 24 | return torch.empty( 25 | self.size, dtype=self.dtype, layout=self.layout, device=self.device 26 | ) 27 | 28 | def zeros(self): 29 | return torch.full( 30 | self.size, 0, dtype=self.dtype, layout=self.layout, device=self.device 31 | ) 32 | -------------------------------------------------------------------------------- /python/monarch/common/tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import Any, Callable, Protocol, Sequence, Tuple 9 | 10 | import torch.utils._pytree as _pytree 11 | from torch.utils._pytree import ( 12 | _get_node_type, 13 | register_pytree_node, 14 | SUPPORTED_NODES, 15 | tree_flatten, 16 | tree_map, 17 | tree_unflatten, 18 | ) 19 | 20 | 21 | def flatten(tree, cond): 22 | r, spec = tree_flatten(tree) 23 | 24 | # be careful to not capture values we return in 25 | # 'trues'. We do not need them to reconstruct and do not want to 26 | # extend their lifetime. 
27 | trues = [] 28 | falses = [] 29 | conds = [] 30 | for e in r: 31 | c = cond(e) 32 | (trues if c else falses).append(e) 33 | conds.append(c) 34 | 35 | def unflatten(n): 36 | n_it = iter(n) 37 | falses_it = iter(falses) 38 | return tree_unflatten([next(n_it if c else falses_it) for c in conds], spec) 39 | 40 | return trues, unflatten 41 | 42 | 43 | def flattener(tree, cond=None): 44 | """ 45 | Produce a _traceable_ flattener routine from tree. That is, it produces code that can 46 | flatten another object shaped the same as tree, but whose structure cannot 47 | be introspected because it might be (e.g.) an fx proxy value. 48 | """ 49 | if isinstance(tree, (tuple, list)): 50 | flattens = [flattener(t, cond) for t in tree] 51 | return lambda obj: [ 52 | f for i, flatten in enumerate(flattens) for f in flatten(obj[i]) 53 | ] 54 | elif isinstance(tree, dict): 55 | keys = tuple(tree.keys()) 56 | flattens = [flattener(t, cond) for t in tree.values()] 57 | return lambda obj: [ 58 | f for k, flatten in zip(keys, flattens) for f in flatten(obj[k]) 59 | ] 60 | elif _get_node_type(tree) in SUPPORTED_NODES: 61 | flatten_fn = SUPPORTED_NODES[_get_node_type(tree)].flatten_fn 62 | trees, _ = flatten_fn(tree) 63 | flattens = [flattener(t, cond) for t in trees] 64 | 65 | def the_flattener(obj): 66 | trees, _ = flatten_fn(obj) 67 | return [f for i, flatten in enumerate(flattens) for f in flatten(trees[i])] 68 | 69 | return the_flattener 70 | elif cond is None or cond(tree): 71 | return lambda obj: [obj] 72 | else: 73 | return lambda obj: [] 74 | -------------------------------------------------------------------------------- /python/monarch/controller/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/controller/debugger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import sys 9 | from typing import Optional 10 | 11 | _is_ipython: Optional[bool] = None 12 | 13 | 14 | def is_ipython() -> bool: 15 | global _is_ipython 16 | if _is_ipython is not None: 17 | return _is_ipython 18 | try: 19 | from IPython import get_ipython 20 | 21 | _is_ipython = get_ipython() is not None 22 | except ImportError: 23 | _is_ipython = False 24 | return _is_ipython 25 | 26 | 27 | def write(msg: str) -> None: 28 | sys.stdout.write(msg) 29 | sys.stdout.flush() 30 | 31 | 32 | def read(requested_size: int) -> bytes: 33 | if not is_ipython(): 34 | b = bytearray(requested_size) 35 | bytes_read = sys.stdin.buffer.raw.readinto(b) 36 | return bytes(b[:bytes_read]) 37 | 38 | # ipython doesn't have stdin directly connected 39 | # so we need to use input() instead. 
40 | user_input = input() + "\n" 41 | input_bytes = user_input.encode("utf-8") 42 | num_bytes_to_write = len(input_bytes) 43 | if requested_size < num_bytes_to_write: 44 | raise RuntimeError( 45 | f"Debugger input line too long, max length is {requested_size}" 46 | ) 47 | return input_bytes[:num_bytes_to_write] 48 | -------------------------------------------------------------------------------- /python/monarch/controller/rust_backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/fetch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | """ 9 | This is a utility file for fetching a shard of a tensor from remote. 10 | """ 11 | 12 | from typing import TypeVar 13 | 14 | from monarch.common.device_mesh import no_mesh 15 | 16 | from monarch.common.future import Future 17 | 18 | from monarch.common.remote import _call_on_shard_and_fetch 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | def fetch_shard( 24 | obj: T, shard: dict[str, int] | None = None, **kwargs: int 25 | ) -> Future[T]: 26 | """ 27 | Retrieve the shard at `coordinates` of the current device mesh of each 28 | tensor in obj. All tensors in `obj` will be fetched to the CPU device. 29 | obj - a pytree containing the tensors the fetch 30 | shard - a dictionary from mesh dimension name to coordinate of the shard 31 | If None, this will fetch from coordinate 0 for all dimensions (useful after all_reduce/all_gather) 32 | preprocess - a 33 | **kwargs - additional keyword arguments are added as entries to the shard dictionary 34 | """ 35 | if kwargs: 36 | if shard is None: 37 | shard = {} 38 | shard.update(kwargs) 39 | 40 | return _call_on_shard_and_fetch( 41 | None, lambda *args, **kwargs: None, obj, shard=shard 42 | ) 43 | 44 | 45 | def show(obj: T, shard: dict[str, int] | None = None, **kwargs: int) -> object: 46 | v = inspect(obj, shard=shard, **kwargs) 47 | # pyre-ignore 48 | from torchshow import show # @manual 49 | 50 | with no_mesh.activate(): 51 | return show(v) 52 | 53 | 54 | def inspect(obj: T, shard: dict[str, int] | None = None, **kwargs: int) -> T: 55 | return fetch_shard(obj, shard=shard, **kwargs).result() 56 | -------------------------------------------------------------------------------- /python/monarch/future.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import asyncio 8 | from typing import Generator, Generic, TypeVar 9 | 10 | R = TypeVar("R") 11 | 12 | 13 | # TODO: consolidate with monarch.common.future 14 | class ActorFuture(Generic[R]): 15 | def __init__(self, impl, blocking_impl=None): 16 | self._impl = impl 17 | self._blocking_impl = blocking_impl 18 | 19 | def get(self) -> R: 20 | if self._blocking_impl is not None: 21 | return self._blocking_impl() 22 | return asyncio.run(self._impl()) 23 | 24 | def __await__(self) -> Generator[R, None, R]: 25 | return self._impl().__await__() 26 | -------------------------------------------------------------------------------- /python/monarch/gradient/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | from ._gradient_generator import GradientGenerator 10 | 11 | __all__ = ["GradientGenerator"] 12 | -------------------------------------------------------------------------------- /python/monarch/gradient/_gradient_generator.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import Any, Optional 9 | 10 | import torch 11 | 12 | class GradientGenerator: 13 | def __init__( 14 | self, 15 | roots_list: Any, 16 | with_respect_to: Any, 17 | grad_roots: Any, 18 | context_restorer: Any, 19 | ): ... 20 | # pyre-ignore[11]: Annotation `torch.Tensor` is not defined as a type. 21 | def __next__(self) -> Optional[torch.Tensor]: ... 22 | def __iter__(self) -> "GradientGenerator": ... 23 | -------------------------------------------------------------------------------- /python/monarch/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import itertools 9 | import os 10 | from pathlib import Path 11 | 12 | import torch 13 | from monarch.common.remote import remote 14 | 15 | 16 | PATH_KEY = "dir_snapshots" 17 | _counter = itertools.count() 18 | 19 | 20 | @remote(propagate="inspect") 21 | def record_memory_history() -> None: 22 | torch.cuda.memory._record_memory_history() 23 | 24 | 25 | def dump_memory_snapshot(*args, **kwargs) -> None: 26 | """ 27 | This function wraps torch.cuda.memory._dump_snapshot() to dump memory snapshot remotely. 28 | """ 29 | assert isinstance( 30 | kwargs.get(PATH_KEY, None), str 31 | ), f"{PATH_KEY} must be passed and must be a string to represent the path to save the memory snapshots." 32 | id = next(_counter) 33 | _memory_controller_dump(id, *args, **kwargs) 34 | 35 | 36 | @remote(propagate="inspect") 37 | def _memory_controller_dump(ident, *args, **kwargs) -> None: 38 | dir_path = Path(kwargs[PATH_KEY]).absolute() 39 | os.makedirs(dir_path, exist_ok=True) 40 | # This is not a synchronized call, so it is okay to call without device mesh. 
41 | rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 42 | snapshot_path = f"{dir_path}/snapshot_{rank}.pickle" 43 | torch.cuda.memory._dump_snapshot(filename=snapshot_path) 44 | -------------------------------------------------------------------------------- /python/monarch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from monarch.parallel.pipelining.runtime import get_parameter_udf, PipelineParallelism 8 | 9 | __all__ = ["PipelineParallelism", "get_parameter_udf"] 10 | -------------------------------------------------------------------------------- /python/monarch/parallel/pipelining/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from typing import NamedTuple, Tuple 9 | 10 | import torch 11 | from monarch.common.remote import remote 12 | from monarch.common.tensor import Tensor 13 | 14 | 15 | class State(NamedTuple): 16 | cpu: Tensor 17 | cuda: Tensor 18 | 19 | 20 | @remote( 21 | propagate=lambda: ( 22 | torch.empty(5056, dtype=torch.uint8), 23 | torch.empty(16, dtype=torch.uint8), 24 | ) 25 | ) 26 | def _get_state() -> Tuple[torch.Tensor, torch.Tensor]: 27 | return (torch.get_rng_state(), torch.cuda.get_rng_state()) 28 | 29 | 30 | @remote(propagate=lambda state: None) 31 | def set_state(state: Tuple[Tensor, Tensor]): 32 | cpu, device = state 33 | torch.set_rng_state(cpu) 34 | torch.cuda.set_rng_state(device) 35 | 36 | 37 | @remote(propagate=lambda _: None) 38 | def _manual_seed(seed: torch.Tensor): 39 | torch.manual_seed(seed.item()) 40 | 41 | 42 | @remote(propagate=lambda: None) 43 | def make_deterministic(): 44 | torch.use_deterministic_algorithms(True) 45 | torch.backends.cudnn.deterministic = True 46 | torch.backends.cudnn.benchmark = False 47 | # env var for deterministic CuBLAS 48 | # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html 49 | os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" 50 | 51 | 52 | def get_state() -> State: 53 | return State(*_get_state()) 54 | 55 | 56 | def new_state(seed: Tensor) -> State: 57 | orig = get_state() 58 | _manual_seed(seed) 59 | mine = get_state() 60 | set_state(orig) 61 | return mine 62 | -------------------------------------------------------------------------------- /python/monarch/simulator/README.md: -------------------------------------------------------------------------------- 1 | ## Brief Introduction 2 | The Simulator can act as a backend, similar to ProcessBackend, or as a standalone object to receive messages from a pre-stored file. 
Its primary function is to simulate the execution time and memory usage based on the input messages. 3 | 4 | ### Execution model 5 | The Simulator consists of multiple worker objects, each maintaining several stream objects. The Simulator forwards messages to the workers, which in turn forward them to the streams. A Stream object maintains a task queue and executes the first task when its dependencies are fulfilled. The task will be marked as finished immediately after executing if it is a computation op. If the task is a collective op, it will only be marked as finished after all other tasks participating in the collective op have been executed. A trace event will be created for the task after it is finished. 6 | 7 | ### Memory model 8 | Currently, only GPU memory is recorded. A GPU tensor must be created by some task, so a GPU tensor is created when a task is created. However, its memory will only be allocated after the task is executed. To avoid double-counting the memory usage of tensors that share the same storage, a WorkerStorageTracker is used to track unique storage. The memory usage is increased only when a new storage is created, and decreased only when an existing storage is deleted. The memory usage of a storage is attributed to the stream that creates the storage. 9 | 10 | ## Current Status, Implemented Features 11 | * Concurrent Task Execution: Traces concurrent tasks across different streams and workers, including collective operations. 12 | * Memory Tracking: Traces memory usage without overcounting, particularly for views. 13 | * Controller Message Tracing: Logs messages from the controller for better oversight and debugging. 14 | 15 | ## Pending Features 16 | * Deduplication: Many workers behave the same. The Simulator should group them to make the trace easier to read and the simulation faster. 17 | * Profiling: The current runtime of each op is hardcoded and incorrect. The Simulator should take the profiling result as data to simulate. We would need a feature to support cached propagation of remote functions. 18 | * Remote Function: The Simulator will fail with the new cache propagation remote function or cause a hang. 19 | * Fetch Shard: Not implemented yet. 20 | * Trace of CPU operations: The current design assumes CPU ops have zero overheads, so CPU tensors will just be created without taking time. This is not accurate and can be an issue if users perform optimizer CPU offloading. 21 | -------------------------------------------------------------------------------- /python/monarch/simulator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/simulator/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
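The memory model described in the simulator README above can be illustrated with a toy sketch; the class and method names here are illustrative only and do not match the real `WorkerStorageTracker` interface:

```python
class ToyStorageTracker:
    """Count GPU memory once per unique storage, attributed to the creating stream."""

    def __init__(self):
        self.live = {}            # storage id -> (stream, nbytes)
        self.mem_per_stream = {}  # stream -> bytes currently attributed

    def tensor_created(self, storage_id, stream, nbytes):
        # Views share a storage id, so they do not add to the count.
        if storage_id not in self.live:
            self.live[storage_id] = (stream, nbytes)
            self.mem_per_stream[stream] = self.mem_per_stream.get(stream, 0) + nbytes

    def storage_deleted(self, storage_id):
        # Memory is released only when the storage itself goes away.
        stream, nbytes = self.live.pop(storage_id)
        self.mem_per_stream[stream] -= nbytes


tracker = ToyStorageTracker()
tracker.tensor_created("s0", stream=0, nbytes=4096)
tracker.tensor_created("s0", stream=0, nbytes=4096)  # a view of the same storage
assert tracker.mem_per_stream[0] == 4096
tracker.storage_deleted("s0")
assert tracker.mem_per_stream[0] == 0
```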
6 | 7 | # pyre-unsafe 8 | import contextlib 9 | 10 | META_VAL = [] 11 | 12 | 13 | @contextlib.contextmanager 14 | def set_meta(new_value): 15 | # Sets the metadata for any tasks created under this 16 | global META_VAL 17 | META_VAL.append(new_value) 18 | try: 19 | yield 20 | finally: 21 | META_VAL.pop() 22 | -------------------------------------------------------------------------------- /python/monarch/simulator/interface.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Union 8 | 9 | from monarch.common.client import Client as _Client 10 | from monarch.common.device_mesh import DeviceMesh 11 | from monarch.common.shape import NDSlice 12 | 13 | from monarch.simulator.ir import IRGraph 14 | from monarch.simulator.simulator import ( 15 | SimulatorBackendMode, 16 | SimulatorController as _SimulatorController, 17 | SimulatorInterface, 18 | SimulatorTraceMode, 19 | ) 20 | 21 | 22 | def Simulator( 23 | hosts: int, 24 | gpus: int, 25 | *, 26 | simulate_mode: Union["str", SimulatorBackendMode] = SimulatorBackendMode.SIMULATE, 27 | trace_mode: Union["str", SimulatorTraceMode] = SimulatorTraceMode.STREAM_ONLY, 28 | upload_trace: bool = False, 29 | trace_path: str = "trace.json", 30 | command_history_path: str = "command_history.pkl", 31 | group_workers: bool = False, 32 | build_ir: bool = False, 33 | ) -> "SimulatorInterface": 34 | if isinstance(simulate_mode, str): 35 | simulate_mode = getattr(SimulatorBackendMode, simulate_mode.upper()) 36 | if isinstance(trace_mode, str): 37 | trace_mode = getattr(SimulatorTraceMode, trace_mode.upper()) 38 | 39 | ir = IRGraph() if build_ir else None 40 | ctrl = _SimulatorController( 41 | hosts * gpus, 42 | gpu_per_host=gpus, 43 | simulate_mode=simulate_mode, 44 | trace_mode=trace_mode, 45 | upload_trace=upload_trace, 46 | trace_path=trace_path, 47 | command_history_path=command_history_path, 48 | group_workers=group_workers, 49 | ir=ir, 50 | ) 51 | client = _Client(ctrl, ctrl.world_size, ctrl.gpu_per_host) 52 | dm = DeviceMesh( 53 | client, 54 | NDSlice(offset=0, sizes=[hosts, gpus], strides=[gpus, 1]), 55 | ("host", "gpu"), 56 | ) 57 | 58 | dm.exit = lambda: client.shutdown() 59 | return SimulatorInterface(dm, ctrl, ir) 60 | -------------------------------------------------------------------------------- /python/monarch/simulator/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import os 9 | 10 | import numpy as np 11 | 12 | 13 | def file_path_with_iter(file_path: str, iter_count: int) -> str: 14 | dir_path = os.path.dirname(file_path) 15 | file_name, file_postfix = os.path.basename(file_path).split(".") 16 | file_name = f"{file_name}_{iter_count}.{file_postfix}" 17 | return os.path.join(dir_path, file_name) 18 | 19 | 20 | def compress_workers_range(workers) -> str: 21 | regions = [] 22 | sorted_workers = np.sort(workers) 23 | start = sorted_workers[0] 24 | end = sorted_workers[0] 25 | for i in range(1, len(sorted_workers)): 26 | if sorted_workers[i] == end + 1: 27 | end = sorted_workers[i] 28 | else: 29 | regions.append(f"[{start}-{end}]") 30 | start = sorted_workers[i] 31 | end = sorted_workers[i] 32 | regions.append(f"[{start}-{end}]") 33 | return " ".join(regions) 34 | 35 | 36 | def clean_name(name: str) -> str: 37 | if name.startswith("torch.ops.aten."): 38 | name = name[len("torch.ops.") :] # noqa: whitespace before ':' 39 | if name.endswith(".default"): 40 | name = name[: -len(".default")] 41 | return name 42 | -------------------------------------------------------------------------------- /python/monarch/timer/README.md: -------------------------------------------------------------------------------- 1 | # CUDA Timer 2 | 3 | This folder contains a lightweight CUDA timer utility and examples demonstrating its usage in GPU-accelerated programs. The CUDA Timer is designed to measure the execution time of GPU kernels using CUDA events. 4 | 5 | ## Usage 6 | ### CudaTimer API 7 | 8 | The `CudaTimer` singleton provides a comprehensive timing interface for CUDA operations: 9 | 10 | - `start(label)` - Begins timing a labeled operation 11 | - `stop(label)` - Ends timing for the labeled operation 12 | - `time(label)` - Context manager for automatic timing (recommended usage) 13 | - `reset()` - Clears all collected timing data 14 | - `summary()` - Returns statistical analysis of timing measurements 15 | - `get_latest_measurement(label)` - Gets the latest measurement (in ms) for a given section 16 | - `print_summary()` - Displays formatted timing statistics to console 17 | 18 | ### Within SPMD workloads 19 | We provide an example of CudaTimer within SPMD workloads at [example_spmd.py](example_spmd.py). 20 | 21 | ``` 22 | import torch 23 | from monarch.timer import CudaTimer 24 | 25 | def main(): 26 | if not torch.cuda.is_available(): 27 | print("CUDA is not available. Exiting.") 28 | return 29 | 30 | device = torch.device("cuda") 31 | a = torch.randn(1000, 1000, device=device) 32 | b = torch.randn(1000, 1000, device=device) 33 | 34 | with CudaTimer.time("matrix_multiply"): 35 | result = torch.matmul(a, b) 36 | 37 | CudaTimer.print_summary() 38 | 39 | ``` 40 | 41 | ### Within Monarch workloads 42 | We provide an example of CudaTimer within Monarch workloads at [example_monarch.py](example_monarch.py).
43 | 44 | ``` 45 | import torch 46 | from monarch import inspect, remote 47 | from monarch.rust_local_mesh import local_mesh 48 | 49 | cuda_timer_start = remote("monarch.timer.remote_cuda_timer.cuda_timer_start", propagate="inspect") 50 | cuda_timer_stop = remote("monarch.timer.remote_cuda_timer.cuda_timer_stop", propagate="inspect") 51 | 52 | def main(): 53 | mesh = local_mesh(hosts=1, gpus_per_host=1) 54 | 55 | with mesh.activate(): 56 | a = torch.randn(1000, 1000, device="cuda") 57 | b = torch.randn(1000, 1000, device="cuda") 58 | 59 | cuda_timer_start() 60 | result = torch.matmul(a, b) 61 | cuda_timer_stop() 62 | 63 | cuda_average_ms = get_cuda_timer_average_ms() 64 | local_cuda_avg_ms = inspect(cuda_average_ms).item() 65 | 66 | mesh.exit() 67 | print(f"average time w/ CudaTimer: {local_cuda_avg_ms:.4f} (ms)") 68 | ``` 69 | -------------------------------------------------------------------------------- /python/monarch/timer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .execution_timer import ( 8 | execution_timer_start, 9 | execution_timer_stop, 10 | ExecutionTimer, 11 | get_execution_timer_average_ms, 12 | get_latest_timer_measurement, 13 | ) 14 | 15 | __all__ = [ 16 | "ExecutionTimer", 17 | "execution_timer_start", 18 | "execution_timer_stop", 19 | "get_latest_timer_measurement", 20 | "get_execution_timer_average_ms", 21 | ] 22 | -------------------------------------------------------------------------------- /python/monarch/timer/example_monarch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """An example that demonstrates how to use ExecutionTimer with a Monarch program. 
8 | 9 | Run this with 10 | buck run //monarch/python/monarch/timer:example_monarch 11 | 12 | """ 13 | # pyre-unsafe 14 | 15 | import logging 16 | 17 | import torch 18 | 19 | from monarch import inspect, remote 20 | from monarch.rust_local_mesh import local_mesh 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | execution_timer_start = remote( 26 | "monarch.timer.remote_execution_timer.execution_timer_start", propagate="inspect" 27 | ) 28 | 29 | execution_timer_stop = remote( 30 | "monarch.timer.remote_execution_timer.execution_timer_stop", propagate="inspect" 31 | ) 32 | 33 | get_execution_timer_average_ms = remote( 34 | "monarch.timer.remote_execution_timer.get_execution_timer_average_ms", 35 | propagate=lambda: torch.tensor(0.0, dtype=torch.float64), 36 | ) 37 | 38 | get_time_perfcounter = remote( 39 | "monarch.timer.remote_execution_timer.get_time_perfcounter", 40 | propagate=lambda: torch.tensor(0.0, dtype=torch.float64), 41 | ) 42 | 43 | 44 | def main() -> None: 45 | with local_mesh(hosts=1, gpus_per_host=1) as mesh: 46 | with mesh.activate(): 47 | num_iterations = 5 48 | 49 | a = torch.randn(1000, 1000, device="cuda") 50 | b = torch.randn(1000, 1000, device="cuda") 51 | torch.matmul(a, b) 52 | 53 | total_dt = torch.zeros(1, dtype=torch.float64) 54 | 55 | for _ in range(num_iterations): 56 | t0 = get_time_perfcounter() 57 | torch.matmul(a, b) 58 | total_dt += get_time_perfcounter() - t0 59 | 60 | for _ in range(num_iterations): 61 | execution_timer_start() 62 | torch.matmul(a, b) 63 | execution_timer_stop() 64 | 65 | cuda_average_ms = get_execution_timer_average_ms() 66 | local_total_dt = inspect(total_dt) 67 | local_cuda_avg_ms = inspect(cuda_average_ms) 68 | 69 | local_total_dt = local_total_dt.item() 70 | local_cuda_avg_ms = local_cuda_avg_ms.item() 71 | mesh.exit() 72 | avg_perfcounter_ms = local_total_dt / num_iterations * 1000 73 | print(f"average time w/ perfcounter: {avg_perfcounter_ms:.4f} (ms)") 74 | print(f"average time w/ ExecutionTimer: {local_cuda_avg_ms:.4f} (ms)") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /python/monarch/timer/example_spmd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """An example that demonstrates how to use ExecutionTimer in a SPMD style program. 8 | 9 | Run this with: 10 | buck run //monarch/python/monarch/timer:example_spmd 11 | """ 12 | 13 | import time 14 | 15 | # pyre-strict 16 | 17 | import torch 18 | from monarch.timer import ExecutionTimer 19 | 20 | 21 | def main() -> None: 22 | # Check if CUDA is available 23 | if not torch.cuda.is_available(): 24 | print("CUDA is not available. 
Exiting.") 25 | return 26 | 27 | device = torch.device("cuda") 28 | 29 | num_iterations = 5 30 | 31 | a = torch.randn(1000, 1000, device=device) 32 | b = torch.randn(1000, 1000, device=device) 33 | 34 | # Warmup 35 | torch.matmul(a, b) 36 | torch.cuda.synchronize() 37 | 38 | cpu_timings = [] 39 | for _ in range(num_iterations): 40 | t0 = time.perf_counter() 41 | torch.matmul(a, b) 42 | cpu_timings.append(time.perf_counter() - t0) 43 | 44 | for _ in range(num_iterations): 45 | with ExecutionTimer.time("matrix_multiply"): 46 | torch.matmul(a, b) 47 | 48 | mean_cuda_ms = ExecutionTimer.summary()["matrix_multiply"]["mean_ms"] 49 | mean_perfcounter_ms = sum(cpu_timings) / len(cpu_timings) * 1000 50 | print("mean perf counter times: ", mean_perfcounter_ms) 51 | print("mean cuda times: ", mean_cuda_ms) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /python/monarch/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/tools/components/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/tools/components/hyperactor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import getpass 9 | from typing import Optional 10 | 11 | from monarch.tools import mesh_spec 12 | from monarch.tools.mesh_spec import mesh_spec_from_str 13 | from torchx import specs 14 | 15 | _DEFAULT_MESHES = ["mesh_0:1:gpu.small"] 16 | 17 | _USER: str = getpass.getuser() 18 | 19 | __version__ = "latest" # TODO get version from monarch.__version_ 20 | 21 | 22 | def proc_mesh( 23 | name: str = f"monarch-{_USER}", 24 | image: str = f"ghcr.io/pytorch-labs/monarch:{__version__}", # TODO docker needs to be built and pushed to ghcr 25 | meshes: list[str] = _DEFAULT_MESHES, 26 | env: Optional[dict[str, str]] = None, 27 | port: int = mesh_spec.DEFAULT_REMOTE_ALLOCATOR_PORT, 28 | ) -> specs.AppDef: 29 | """ 30 | Args: 31 | name: the name of the monarch server job 32 | image: docker image to run the job on, for slurm, image is the dir the job is run from 33 | meshes: list of mesh specs of the form "{name}:{num_hosts}:{host_type}" 34 | env: environment variables to be passed to the main command (e.g. 
ENV1=v1,ENV2=v2,ENV3=v3) 35 | port: the port that the remote process allocator runs on (must be reachable from the client) 36 | """ 37 | 38 | appdef = specs.AppDef(name) 39 | 40 | for mesh in [mesh_spec_from_str(mesh) for mesh in meshes]: 41 | mesh_role = specs.Role( 42 | name=mesh.name, 43 | image=image, 44 | entrypoint="process_allocator", # 'cargo install monarch_hyperactor' to get this binary 45 | args=[ 46 | "mesh-worker", 47 | f"--port={port}", 48 | "--program=monarch_bootstrap", # installed with monarch wheel (as console script) 49 | ], 50 | num_replicas=mesh.num_hosts, 51 | resource=specs.resource(h=mesh.host_type), 52 | env=env or {}, 53 | port_map={"mesh": port}, 54 | ) 55 | appdef.roles.append(mesh_role) 56 | 57 | return appdef 58 | -------------------------------------------------------------------------------- /python/monarch/tools/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | from dataclasses import dataclass, field 9 | from typing import Any, Optional 10 | 11 | 12 | NOT_SET: str = "__NOT_SET__" 13 | 14 | 15 | @dataclass 16 | class Config: 17 | scheduler: str = NOT_SET 18 | scheduler_args: dict[str, Any] = field(default_factory=dict) 19 | workspace: Optional[str] = None 20 | dryrun: bool = False 21 | -------------------------------------------------------------------------------- /python/monarch/tools/config/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
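A hedged example of building a TorchX `AppDef` with the `proc_mesh` component defined in `python/monarch/tools/components/hyperactor.py` above; the mesh name, host count, and environment values are hypothetical:

```python
from monarch.tools.components.hyperactor import proc_mesh

# "trainers:4:gpu.small" follows the "{name}:{num_hosts}:{host_type}" format
# described in the proc_mesh docstring.
appdef = proc_mesh(
    name="monarch-demo",
    meshes=["trainers:4:gpu.small"],
    env={"RUST_LOG": "info"},
)

# One role is created per mesh spec, each running the remote process allocator.
print([role.name for role in appdef.roles])  # ["trainers"]
```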
6 | 7 | # pyre-strict 8 | 9 | """Defines defaults for ``monarch.tools``""" 10 | 11 | from typing import Callable, Optional 12 | 13 | from monarch.tools.components import hyperactor 14 | from monarch.tools.config import Config 15 | 16 | from torchx import specs 17 | from torchx.schedulers import ( 18 | docker_scheduler, 19 | kubernetes_scheduler, 20 | local_scheduler, 21 | SchedulerFactory, 22 | slurm_scheduler, 23 | ) 24 | 25 | 26 | def component_fn(scheduler: str) -> Callable[..., specs.AppDef]: 27 | """The default TorchX component function for the scheduler""" 28 | return hyperactor.proc_mesh 29 | 30 | 31 | def scheduler_factories() -> dict[str, SchedulerFactory]: 32 | """Supported schedulers (name -> scheduler static factory method)""" 33 | return { # pyre-ignore[7] 34 | # --- local schedulers (no multi-host support) --- 35 | "local_cwd": local_scheduler.create_scheduler, 36 | "local_docker": docker_scheduler.create_scheduler, 37 | # --- remote schedulers (yes multi-host support) --- 38 | "slurm": slurm_scheduler.create_scheduler, 39 | "k8s": kubernetes_scheduler.create_scheduler, 40 | } 41 | 42 | 43 | def config(scheduler: str, workspace: Optional[str] = None) -> Config: 44 | """The default :py:class:`~monarch.tools.config.Config` to use when submitting to the provided ``scheduler``.""" 45 | return Config(scheduler=scheduler, workspace=workspace) 46 | 47 | 48 | def dryrun_info_formatter(dryrun_info: specs.AppDryRunInfo) -> Callable[..., str]: 49 | """Used to attach a formatter to the dryrun info when running 50 | :py:function:`~monarch.tools.commands.create` in ``dryrun`` mode so that 51 | the returned ``AppDryrunInfo`` can be printed to console. 52 | """ 53 | # no-op, use the default formatter already attached to the dryrun info 54 | return dryrun_info._fmt 55 | -------------------------------------------------------------------------------- /python/monarch/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | -------------------------------------------------------------------------------- /python/monarch/worker/lines.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from contextlib import contextmanager 9 | from typing import Any, List 10 | 11 | 12 | class Lines: 13 | """ 14 | Simple way to emit code where we track a per-line context object. 
15 | """ 16 | 17 | def __init__(self, context=None): 18 | self._lines: List[str] = [] 19 | self._context: List[Any] = [] 20 | self._current_context = context 21 | 22 | def get_context(self, lineno) -> Any: 23 | return self._context[lineno - 1] 24 | 25 | @contextmanager 26 | def context(self, obj: Any): 27 | old, self._current_context = self._current_context, obj 28 | try: 29 | yield 30 | finally: 31 | self._current_context = old 32 | 33 | def emit(self, lines: str) -> None: 34 | self._lines.extend(lines.split("\n")) 35 | while len(self._context) < len(self._lines): 36 | self._context.append(self._current_context) 37 | 38 | def emit_lines(self, lines: "Lines") -> None: 39 | """ 40 | Append another lines object on this one, 41 | preserving its per-line context. 42 | """ 43 | self._lines.extend(lines._lines) 44 | self._context.extend(lines._context) 45 | 46 | def text(self) -> str: 47 | return "\n".join(self._lines) 48 | -------------------------------------------------------------------------------- /python/monarch/worker/monitor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | 9 | import math 10 | import queue 11 | import threading 12 | from typing import Callable, Optional, Tuple 13 | 14 | from monarch_supervisor import TTL 15 | 16 | 17 | class Monitor: 18 | """A monitor is a thread that watches for reported events to expire.""" 19 | 20 | def __init__(self) -> None: 21 | self.thread = threading.Thread(target=self._main, daemon=True, name="monitor") 22 | self.events: queue.Queue[Tuple[Callable[[], None], Callable[[], float]]] = ( 23 | queue.Queue() 24 | ) 25 | self.events.put((lambda: None, TTL(None))) 26 | 27 | def start(self) -> None: 28 | """Start the monitor thread.""" 29 | self.thread.start() 30 | 31 | def _main(self) -> None: 32 | debug, ttl = self.events.get() 33 | while True: 34 | try: 35 | timeout = ttl() 36 | next_debug, next_ttl = self.events.get( 37 | timeout=None if timeout == math.inf else timeout 38 | ) 39 | except queue.Empty: 40 | debug() 41 | next_debug, next_ttl = self.events.get(timeout=None) 42 | 43 | debug, ttl = next_debug, next_ttl 44 | 45 | def __call__( 46 | self, 47 | debug_fn: Callable[[], None] = lambda: None, 48 | timeout: Optional[float] = None, 49 | ) -> None: 50 | """Start a new event with the provided timeout. 51 | If a timeout is specified, and a new event is not reported by before it expires, 52 | the provided debug_fn is called.""" 53 | self.events.put((debug_fn, TTL(timeout))) 54 | -------------------------------------------------------------------------------- /python/monarch/world_mesh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | 9 | from typing import List 10 | 11 | from monarch.common.client import Client 12 | 13 | from monarch.common.device_mesh import DeviceMesh 14 | from monarch.common.shape import NDSlice 15 | 16 | from monarch.controller.backend import ProcessBackend 17 | 18 | from monarch.controller.controller import Controller 19 | from monarch_supervisor import Context, Host 20 | 21 | 22 | def world_mesh( 23 | ctx: Context, 24 | hosts: List[Host], 25 | gpu_per_host: int, 26 | _processes=None, 27 | ) -> DeviceMesh: 28 | backend = ProcessBackend(ctx, hosts, gpu_per_host, _processes=_processes) 29 | client = Client(Controller(backend), backend.world_size, backend.gpu_per_host) 30 | return DeviceMesh( 31 | client, 32 | NDSlice(offset=0, sizes=[len(hosts), gpu_per_host], strides=[gpu_per_host, 1]), 33 | ("host", "gpu"), 34 | ) 35 | -------------------------------------------------------------------------------- /python/monarch_supervisor/_testing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | from typing import NamedTuple 9 | 10 | from monarch_supervisor import get_message_queue 11 | 12 | 13 | class Reply(NamedTuple): 14 | a: int 15 | b: int 16 | x: int 17 | 18 | 19 | def reply_hello(a, b, x): 20 | q = get_message_queue() 21 | q.send(Reply(a, b, x)) 22 | 23 | 24 | def echo(): 25 | q = get_message_queue() 26 | i = 0 27 | while True: 28 | sender, m = q.recv() 29 | if m == "exit": 30 | break 31 | assert m == i 32 | q.send(m) 33 | i += 1 34 | 35 | 36 | class Mapper: 37 | def map(self, items): 38 | return sum(x * 2 for x in items) 39 | 40 | def reduce(self, items): 41 | return sum(items) 42 | 43 | def finish(self, result): 44 | return result 45 | -------------------------------------------------------------------------------- /python/monarch_supervisor/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/monarch_supervisor/diagram.png -------------------------------------------------------------------------------- /python/monarch_supervisor/function_call.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import importlib.util 9 | import sys 10 | 11 | from monarch_supervisor import _FunctionCall, get_message_queue 12 | 13 | if __name__ == "__main__": 14 | q = get_message_queue() 15 | _, call = q.recv() 16 | assert isinstance(call, _FunctionCall) 17 | filename, *rest = call.target.split(":", 1) 18 | if not rest: 19 | modulename, funcname = filename.rsplit(".", 1) 20 | module = importlib.import_module(modulename) 21 | else: 22 | spec = importlib.util.spec_from_file_location("__entry__", filename) 23 | assert spec is not None and spec.loader is not None 24 | module = importlib.util.module_from_spec(spec) 25 | # pyre-ignore[16] 26 | spec.loader.exec_module(module) 27 | sys.modules["__entry__"] = module 28 | funcname = rest[0] 29 | func = getattr(module, funcname) 30 | func(*call.args, **call.kwargs) 31 | -------------------------------------------------------------------------------- /python/monarch_supervisor/log_pstree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-strict 8 | import logging 9 | import subprocess 10 | import sys 11 | from typing import Optional 12 | 13 | from monarch_supervisor.logging import gethostname, initialize_logging 14 | 15 | pid: str 16 | logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | def extract_pss(pid: str) -> Optional[str]: 20 | try: 21 | with open(f"/proc/{pid}/smaps_rollup", "r") as f: 22 | for line in f.readlines(): 23 | if line.startswith("Pss:"): # Check if the line starts with 'Pss:' 24 | return " ".join(line.split()[1:3]) 25 | except Exception: 26 | pass 27 | return None 28 | 29 | 30 | def log_pstree_output(pid: int) -> None: 31 | pstree_output = subprocess.check_output(["pstree", "-Tap", str(pid)]).decode( 32 | "utf-8" 33 | ) 34 | lines = pstree_output.split("\n") 35 | logger.info("Process Info") 36 | for line in lines: 37 | if not line.strip(): 38 | continue 39 | parts = line.split(",") 40 | pids = parts[1].split()[0] 41 | mem = extract_pss(pids) 42 | logger.info(f"{line} {mem}") 43 | 44 | 45 | if __name__ == "__main__": 46 | (pid,) = sys.argv[1:] 47 | initialize_logging(f"{gethostname()} host-manager") 48 | log_pstree_output(int(pid)) 49 | -------------------------------------------------------------------------------- /python/monarch_supervisor/python_executable.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import importlib.resources 8 | import os 9 | import sys 10 | 11 | try: 12 | from __manifest__ import fbmake # noqa 13 | 14 | IN_PAR = True 15 | except ImportError: 16 | IN_PAR = False 17 | 18 | PYTHON_EXECUTABLE: str 19 | if IN_PAR: 20 | # The worker bootstrap binary will import this supervisor lib. When that 21 | # happens don't try to search for the bootstrap binary again, just use the 22 | # current executable. 
23 | import __main__ as main_module # @manual 24 | 25 | if hasattr(main_module, "__MONARCH_TENSOR_WORKER_ENV__"): 26 | PYTHON_EXECUTABLE = os.environ["FB_XAR_INVOKED_NAME"] 27 | else: 28 | try: 29 | with importlib.resources.path( 30 | "monarch_tensor_worker_env", "worker_env" 31 | ) as path: 32 | if not path.exists(): 33 | raise ImportError() 34 | PYTHON_EXECUTABLE = str(path) 35 | except ImportError: 36 | raise ImportError( 37 | "Monarch worker env not found, please define a custom 'monarch_tensor_worker_env' or " 38 | "add '//monarch/python/monarch_supervisor/worker:default_worker_env' " 39 | "to your binary dependencies in TARGETS" 40 | ) 41 | else: 42 | PYTHON_EXECUTABLE = sys.executable 43 | -------------------------------------------------------------------------------- /python/monarch_supervisor/worker/worker_env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import runpy 9 | import sys 10 | 11 | __MONARCH_TENSOR_WORKER_ENV__ = True 12 | 13 | 14 | def main() -> None: 15 | assert sys.argv[1] == "-m" 16 | main_module = sys.argv[2] 17 | 18 | # Remove the -m and the main module from the command line arguments before 19 | # forwarding 20 | sys.argv[1:] = sys.argv[3:] 21 | # pyre-fixme[16]: Module `runpy` has no attribute `_run_module_as_main`. 22 | runpy._run_module_as_main(main_module, alter_argv=False) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/tests/__init__.py -------------------------------------------------------------------------------- /python/tests/_monarch/test_client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | 9 | from unittest import TestCase 10 | 11 | import torch 12 | from monarch._rust_bindings.monarch_extension import client 13 | 14 | from monarch._rust_bindings.monarch_hyperactor.proc import ActorId 15 | from pyre_extensions import none_throws 16 | 17 | 18 | class TestClient(TestCase): 19 | def test_simple_with_error_response(self) -> None: 20 | err = client.Error.new_for_unit_test( 21 | 7, 22 | 8, 23 | ActorId(world_name="test", rank=0, actor_name="actor"), 24 | "test error", 25 | ) 26 | resp = client.WorkerResponse.new_for_unit_test( 27 | seq=10, 28 | response=err, 29 | ) 30 | self.assertTrue(resp.is_exception()) 31 | exc = none_throws(resp.exception()) 32 | assert isinstance(exc, client.Error) 33 | 34 | self.assertEqual(exc.backtrace, "test error") 35 | self.assertEqual(resp.result(), None) 36 | self.assertEqual(resp.seq, 10) 37 | 38 | def test_simple_with_result_response(self) -> None: 39 | resp = client.WorkerResponse.new_for_unit_test( 40 | seq=11, 41 | response={"test": 1}, 42 | ) 43 | self.assertFalse(resp.is_exception()) 44 | self.assertEqual(resp.exception(), None) 45 | self.assertEqual(resp.result(), {"test": 1}) 46 | self.assertEqual(resp.seq, 11) 47 | 48 | def test_tensor(self) -> None: 49 | tensor = torch.rand(3) 50 | resp = client.WorkerResponse.new_for_unit_test( 51 | seq=11, 52 | response={"result": tensor}, 53 | ) 54 | self.assertTrue(torch.equal(resp.result()["result"], tensor)) 55 | -------------------------------------------------------------------------------- /python/tests/builtins/test_log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import logging 9 | from unittest.mock import patch 10 | 11 | import pytest 12 | 13 | from monarch._testing import BackendType, TestingContext 14 | from monarch.builtins.log import log_remote, set_logging_level_remote 15 | 16 | 17 | @pytest.fixture(scope="module", autouse=True) 18 | def testing_context(): 19 | global local 20 | with TestingContext() as local: 21 | yield 22 | 23 | 24 | @pytest.mark.timeout(120) 25 | @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS]) 26 | class TestLogFunctions: 27 | @classmethod 28 | def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True): 29 | return local.local_device_mesh( 30 | num_hosts, 31 | gpu_per_host, 32 | activate, 33 | rust=backend_type == BackendType.RS, 34 | ) 35 | 36 | @patch("monarch.builtins.log.logger") 37 | def test_log_remote_default_level(self, mock_log, backend_type): 38 | with self.local_device_mesh(1, 1, backend_type): 39 | log_remote("test warning message") 40 | 41 | @patch("monarch.builtins.log.logger") 42 | def test_log_remote_with_args(self, mock_log, backend_type): 43 | with self.local_device_mesh(1, 1, backend_type): 44 | log_remote("test message with %s and %d", "str", 42) 45 | 46 | @patch("monarch.builtins.log.logger") 47 | def test_set_logging_level_remote(self, mock_logger, backend_type): 48 | with self.local_device_mesh(1, 1, backend_type): 49 | set_logging_level_remote(logging.DEBUG) 50 | 51 | @patch("monarch.builtins.log.logger") 52 | def test_log_remote_custom_level(self, mock_log, backend_type): 53 | with self.local_device_mesh(1, 1, backend_type): 54 | set_logging_level_remote(logging.ERROR) 55 | log_remote("ignored info message", level=logging.INFO) 56 | log_remote("seen error message", level=logging.ERROR) 57 | 58 | @patch("monarch.builtins.log.logger") 59 | def test_log_remote_multiple_calls(self, mock_log, backend_type): 60 | with self.local_device_mesh(1, 1, backend_type): 61 | log_remote("First message") 62 | log_remote("Second message", level=logging.INFO) 63 | log_remote("Third message", level=logging.ERROR) 64 | -------------------------------------------------------------------------------- /python/tests/dispatch_bench_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-unsafe 8 | import torch 9 | 10 | from monarch.common.remote import remote 11 | 12 | 13 | def run_loop_local(n_iters, tensor_shape=(2, 2)): 14 | local = torch.zeros(*tensor_shape) 15 | ones = torch.ones(*tensor_shape) 16 | for _ in range(n_iters): 17 | local = ones + local 18 | return local 19 | 20 | 21 | def _run_loop(*args, **kwargs): 22 | return torch.ones(args[1]) 23 | 24 | 25 | run_loop = remote("tests.dispatch_bench_helper.run_loop_local", propagate=_run_loop) 26 | -------------------------------------------------------------------------------- /python/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-timeout 3 | pytest-asyncio 4 | -------------------------------------------------------------------------------- /python/tests/simulator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch-labs/monarch/2876b0c5f131c092549d68c645d2ba37dfde857a/python/tests/simulator/__init__.py -------------------------------------------------------------------------------- /python/tests/simulator/test_task.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | import unittest 9 | 10 | from monarch.simulator.task import Task, TaskState, WorkerTaskManager 11 | 12 | 13 | class TestTask(unittest.TestCase): 14 | def test_worker_task_manager(self): 15 | manager = WorkerTaskManager() 16 | kwargs = { 17 | "inputs": [2], 18 | "outputs": [3], 19 | "command_id": 1, 20 | "start_time": 9, 21 | "runtime": 1, 22 | "meta": ["a"], 23 | } 24 | task = Task(**kwargs) 25 | task._state = TaskState.EXECUTED 26 | 27 | manager.add(task) 28 | # This task is executed. 29 | manager.remove(task) 30 | 31 | task2 = Task(**kwargs) 32 | task2.dependencies = [task] 33 | manager.add(task2) 34 | 35 | collectives = [] 36 | collective_task = Task(collectives=collectives, **kwargs) 37 | collective_task.dependencies = [task2] 38 | manager.add(collective_task) 39 | # This is from another worker. Don't add it to the manager. 
40 | other_worker_task = Task(**kwargs) 41 | 42 | collectives.append(other_worker_task) 43 | wait_task = Task(waits=[task], **kwargs) 44 | manager.add(wait_task) 45 | 46 | cloned_manager = manager.clone() 47 | 48 | self.assertEqual(len(manager.tasks), 3) 49 | self.assertEqual(manager.tasks.keys(), cloned_manager.tasks.keys()) 50 | cloned_task2 = cloned_manager.tasks[task2.task_id] 51 | self.assertNotEqual(task2, cloned_task2) 52 | for k in kwargs.keys(): 53 | self.assertEqual(getattr(cloned_task2, k), getattr(task2, k)) 54 | self.assertEqual(cloned_task2.dependencies[0].task_id, task.task_id) 55 | self.assertNotEqual(cloned_task2.dependencies[0], task) 56 | cloned_wait_task = cloned_manager.tasks[wait_task.task_id] 57 | self.assertEqual(cloned_wait_task.waits[0].task_id, task.task_id) 58 | self.assertNotEqual(cloned_wait_task.waits[0], task) 59 | 60 | self.assertEqual(len(collectives), 3) 61 | cloned_collective_task = cloned_manager.tasks[collective_task.task_id] 62 | self.assertTrue(collective_task in collectives) 63 | self.assertTrue(cloned_collective_task in collectives) 64 | self.assertNotEqual(collective_task, cloned_collective_task) 65 | -------------------------------------------------------------------------------- /python/tests/sleep_binary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | 8 | # pyre-strict 9 | 10 | """ 11 | A simple binary that calls the sleep_indefinitely_for_unit_tests function from the monarch extension. 12 | This is used to test the signal handling behavior of signal_safe_block_on. 13 | """ 14 | 15 | import sys 16 | 17 | from monarch._rust_bindings.monarch_hyperactor.runtime import ( # @manual 18 | sleep_indefinitely_for_unit_tests, 19 | ) 20 | 21 | 22 | def main() -> None: 23 | print("Starting sleep_binary. Process will sleep indefinitely until interrupted.") 24 | sys.stdout.flush() # Ensure the message is printed before we sleep 25 | 26 | try: 27 | # This will sleep indefinitely until interrupted by a signal 28 | sleep_indefinitely_for_unit_tests() 29 | except KeyboardInterrupt: 30 | print("Received KeyboardInterrupt, exiting.") 31 | sys.exit(0) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /python/tests/test_alloc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | 9 | from unittest import IsolatedAsyncioTestCase 10 | 11 | from monarch import ProcessAllocator 12 | from monarch._rust_bindings.hyperactor_extension.alloc import ( # @manual=//monarch/monarch_extension:monarch_extension 13 | AllocConstraints, 14 | AllocSpec, 15 | ) 16 | 17 | 18 | class TestAlloc(IsolatedAsyncioTestCase): 19 | async def test_basic(self) -> None: 20 | cmd = "echo hello" 21 | allocator = ProcessAllocator(cmd) 22 | spec = AllocSpec(AllocConstraints(), replica=2) 23 | alloc = await allocator.allocate(spec) 24 | 25 | print(alloc) 26 | -------------------------------------------------------------------------------- /python/tests/test_sim_backend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pyre-unsafe 8 | 9 | from contextlib import contextmanager 10 | from typing import Generator, Optional 11 | from unittest import TestCase 12 | 13 | import pytest 14 | 15 | import torch 16 | from monarch import fetch_shard 17 | from monarch.common.device_mesh import DeviceMesh 18 | from monarch.sim_mesh import sim_mesh 19 | 20 | 21 | @contextmanager 22 | def local_sim_mesh( 23 | hosts: int = 1, 24 | # TODO: support multiple gpus in a mesh. 25 | gpu_per_host: int = 1, 26 | activate: bool = True, 27 | proxy_addr: Optional[str] = None, 28 | ) -> Generator[DeviceMesh, None, None]: 29 | dms = sim_mesh( 30 | n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host, proxy_addr=proxy_addr 31 | ) 32 | dm = dms[0] 33 | try: 34 | if activate: 35 | with dm.activate(): 36 | yield dm 37 | else: 38 | yield dm 39 | dm.exit() 40 | except Exception: 41 | dm.client._shutdown = True 42 | raise 43 | 44 | 45 | # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited 46 | @pytest.mark.oss_skip 47 | class TestSimBackend(TestCase): 48 | def test_local_mesh_setup(self): 49 | with local_sim_mesh(): 50 | t = torch.zeros(3, 4) 51 | t.add_(1) 52 | local_t = fetch_shard(t).result() 53 | # consider support specifying the return value in the mock worker. 54 | assert local_t is not None 55 | -------------------------------------------------------------------------------- /python/tests/tools/config/test_defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | import unittest 9 | from pathlib import Path 10 | 11 | from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults 12 | defaults, 13 | ) 14 | from torchx.specs.builders import _create_args_parser 15 | 16 | 17 | class TestDefaults(unittest.TestCase): 18 | def test_default_config(self) -> None: 19 | for scheduler in defaults.scheduler_factories(): 20 | with self.subTest(scheduler=scheduler): 21 | config = defaults.config(scheduler) 22 | 23 | # make sure that we've set the scheduler name when returning the config 24 | self.assertEqual(scheduler, config.scheduler) 25 | 26 | # make sure a new Config is returned each time 27 | # by modifying the returned config 28 | # -> re-getting the default configs for the same scheduler 29 | # -> validating the changes are not persisted in the new config 30 | self.assertNotIn("foo", config.scheduler_args) 31 | config.scheduler_args["foo"] = "bar" 32 | self.assertNotIn("foo", defaults.config(scheduler).scheduler_args) 33 | 34 | def test_default_config_workspace(self) -> None: 35 | current_working_dir = str(Path.cwd()) 36 | config = defaults.config("local_cwd", current_working_dir) 37 | self.assertEqual(current_working_dir, config.workspace) 38 | 39 | def test_default_scheduler_factories(self) -> None: 40 | # just make sure the common schedulers are present 41 | self.assertIn("local_cwd", defaults.scheduler_factories()) 42 | self.assertIn("slurm", defaults.scheduler_factories()) 43 | 44 | def test_default_component(self) -> None: 45 | # just make sure there exists a default component for each configured scheduler 46 | # and that the returned default component is a valid component 47 | for scheduler in defaults.scheduler_factories(): 48 | with self.subTest(scheduler=scheduler): 49 | component_fn = defaults.component_fn(scheduler) 50 | 51 | # the following will fail if the component_fn is not a valid torchx component 52 | with self.assertRaises(SystemExit): 53 | _create_args_parser(component_fn).parse_args(["--help"]) 54 | -------------------------------------------------------------------------------- /python/tests/tools/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pyre-strict 8 | import contextlib 9 | import io 10 | from typing import Generator 11 | 12 | 13 | @contextlib.contextmanager 14 | def capture_stdout() -> Generator[io.StringIO, None, None]: 15 | with io.StringIO() as buf, contextlib.redirect_stdout(buf): 16 | yield buf 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pyzmq 3 | requests 4 | numpy 5 | pyre-extensions 6 | cloudpickle 7 | torchx-nightly 8 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | # @rustc_version: rustc 1.87.0-nightly (920d95eaf 2025-03-28) 2 | [toolchain] 3 | channel = "nightly-2025-03-29" 4 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Get help on options with `rustfmt --help=config` 2 | # Please keep these in alphabetical order. 3 | edition = "2021" 4 | format_code_in_doc_comments = true 5 | group_imports = "StdExternalCrate" 6 | imports_granularity = "Item" 7 | merge_derives = false 8 | style_edition = "2024" 9 | use_field_init_shorthand = true 10 | -------------------------------------------------------------------------------- /timed_test/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/timed_test:[timed_test,timed_test_test] 2 | 3 | [package] 4 | name = "timed_test" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | 10 | [lib] 11 | test = false 12 | doctest = false 13 | proc-macro = true 14 | 15 | [[test]] 16 | name = "timed_test_test" 17 | path = "tests/basic.rs" 18 | 19 | [dependencies] 20 | quote = "1.0.29" 21 | syn = { version = "2.0.101", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } 22 | 23 | [dev-dependencies] 24 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 25 | -------------------------------------------------------------------------------- /timed_test/tests/basic.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | use timed_test::async_timed_test; 10 | 11 | #[async_timed_test(timeout_secs = 5)] 12 | async fn good() { 13 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 14 | } 15 | 16 | #[async_timed_test(timeout_secs = 1)] 17 | #[should_panic] 18 | async fn bad() { 19 | tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; 20 | } 21 | -------------------------------------------------------------------------------- /tools/rust/ossconfigs/clippy.toml: -------------------------------------------------------------------------------- 1 | disallowed-methods = [ 2 | { path = "tokio::time::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 3 | { path = "std::thread::sleep", reason = "use `hyperactor::clock::Clock::sleep` instead." }, 4 | { path = "tokio::time::Instant::now", reason = "use `hyperactor::clock::Clock::now` instead." 
}, 5 | { path = "std::time::SystemTime::now", reason = "use `hyperactor::clock::Clock::system_time_now` instead." }, 6 | ] 7 | -------------------------------------------------------------------------------- /torch-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | # @generated by autocargo from //monarch/torch-sys:torch-sys 2 | 3 | [package] 4 | name = "torch-sys" 5 | version = "0.0.0" 6 | authors = ["Meta"] 7 | edition = "2021" 8 | license = "BSD-3-Clause" 9 | links = "torch" 10 | 11 | [dependencies] 12 | anyhow = "1.0.95" 13 | async-trait = "0.1.86" 14 | atomic_refcell = "0.1.13" 15 | cxx = "1.0.119" 16 | derive_more = { version = "1.0.0", features = ["full"] } 17 | fxhash = "0.2.1" 18 | monarch_types = { version = "0.0.0", path = "../monarch_types" } 19 | nccl-sys = { path = "../nccl-sys" } 20 | paste = "1.0.14" 21 | pyo3 = { version = "0.22.6", features = ["anyhow"] } 22 | regex = "1.11.1" 23 | serde = { version = "1.0.185", features = ["derive", "rc"] } 24 | thiserror = "2.0.12" 25 | tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] } 26 | tracing = { version = "0.1.41", features = ["attributes", "valuable"] } 27 | 28 | [dev-dependencies] 29 | bincode = "1.3.3" 30 | 31 | [build-dependencies] 32 | bindgen = "0.70.1" 33 | cxx-build = "1.0.119" 34 | -------------------------------------------------------------------------------- /torch-sys/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | See the source documentation or `bunnylol rustdoc torch-sys` to see docs. 4 | 5 | # Cargo build 6 | 7 | The cargo build requires that you have a version of PyTorch installed in your 8 | Python environment. To get set up, run the following on your devgpu: 9 | 10 | ```sh 11 | # get conda on devserver 12 | sudo feature install genai_conda 13 | 14 | # Set up conda env 15 | conda create -n monarch 16 | conda activate monarch 17 | 18 | # install pytorch 19 | conda install pytorch pytorch-cuda=12.4 -c pytorch -c nvidia 20 | 21 | # install cuda toolkit on devserver (requires devgpu) 22 | sudo dnf install cuda-12-0 23 | 24 | # install nccl on devserver (requires devgpu) 25 | sudo dnf install libnccl-devel 26 | 27 | # install libclang on devserver (needed for rust-bindgen) 28 | sudo dnf install clang-devel 29 | 30 | # in monarch/torch-sys 31 | cargo test 32 | ``` 33 | -------------------------------------------------------------------------------- /torch-sys/src/bindings.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 10 | -------------------------------------------------------------------------------- /torch-sys/src/pyobject.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | use cxx::type_id; 10 | use monarch_types::TryIntoPyObject; 11 | use pyo3::prelude::*; 12 | 13 | #[repr(transparent)] 14 | pub(crate) struct FFIPyObject(*mut pyo3::ffi::PyObject); 15 | 16 | // SAFETY: This is just a pointer to a PyObject and the pointer is 17 | // never dereferenced directly. It can only be converted to pyo3::PyObject 18 | // and then dereferenced through that. PyO3 manages the access patterns to 19 | // the underlying PyObject. 20 | // Additionally, we make the assumption that ownership of the underlying 21 | // PyObject is transferred with the it. 22 | // Hence FFIPyObject should always be created from an owned pointer. 23 | unsafe impl cxx::ExternType for FFIPyObject { 24 | type Id = type_id!("monarch::FFIPyObject"); 25 | type Kind = cxx::kind::Trivial; 26 | } 27 | 28 | impl From> for FFIPyObject { 29 | #[inline] 30 | fn from(obj: Py) -> Self { 31 | Self(obj.into_ptr()) 32 | } 33 | } 34 | 35 | impl From> for FFIPyObject { 36 | #[inline] 37 | fn from(obj: Bound<'_, T>) -> Self { 38 | Self(obj.into_ptr()) 39 | } 40 | } 41 | 42 | impl From<&Bound<'_, T>> for FFIPyObject { 43 | #[inline] 44 | fn from(obj: &Bound<'_, T>) -> Self { 45 | Self(obj.clone().into_ptr()) 46 | } 47 | } 48 | 49 | impl IntoPy for FFIPyObject { 50 | #[inline] 51 | fn into_py(self, py: Python<'_>) -> PyObject { 52 | // SAFETY: Pull in the `PyObject` from C/C++. 53 | unsafe { PyObject::from_owned_ptr(py, self.0) } 54 | } 55 | } 56 | 57 | impl TryIntoPyObject for FFIPyObject { 58 | #[inline] 59 | fn try_to_object<'a>(self, py: Python<'a>) -> PyResult> { 60 | // SAFETY: Pull in the `PyObject` from C/C++. 61 | Ok(unsafe { PyObject::from_owned_ptr(py, self.0) }.into_bound(py)) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /torch-sys/src/torch.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the BSD-style license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | // Going for a smaller set of headers until more enums are needed 12 | #include 13 | #include 14 | #include 15 | -------------------------------------------------------------------------------- /torch-sys/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the BSD-style license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | class Custom: 9 | pass 10 | --------------------------------------------------------------------------------