diff --git a/.github/workflows/build-cpu.yml b/.github/workflows/build-cpu.yml index 6ae2a3db1..c86787184 100644 --- a/.github/workflows/build-cpu.yml +++ b/.github/workflows/build-cpu.yml @@ -34,4 +34,7 @@ jobs: pip install -r build-requirements.txt # Build monarch (No tensor engine, CPU version) - USE_TENSOR_ENGINE=0 python setup.py bdist_wheel + USE_TENSOR_ENGINE=0 python -m build --no-isolation --wheel + + # Fix permissions for artifact upload + chmod -R 755 dist/ diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml index 96fd34e8f..c618a475d 100644 --- a/.github/workflows/build-cuda.yml +++ b/.github/workflows/build-cuda.yml @@ -42,4 +42,7 @@ jobs: setup_tensor_engine # Build monarch (CUDA version) - python setup.py bdist_wheel + python -m build --no-isolation --wheel + + # Fix permissions for artifact upload + chmod -R 755 dist/ diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 495f371f6..05af26047 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -50,15 +50,18 @@ jobs: export MONARCH_PACKAGE_NAME="torchmonarch-nightly" export MONARCH_VERSION=$(date +'%Y.%m.%d') - python setup.py bdist_wheel + python -m build --no-isolation --wheel + + # Fix permissions for artifact upload + chmod -R 755 dist/ # hacky until the right distribution wheel can be made... find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \; ls -la dist/ # Run tests - install_python_test_dependencies pip install dist/*.whl + install_python_test_dependencies python -c "import monarch" publish: name: Publish to PyPI diff --git a/README.md b/README.md index f438fe13a..4023e4ab5 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,18 @@ **Monarch** is a distributed programming framework for PyTorch based on scalable actor messaging. It provides: -1. 
Remote actors with scalable messaging: Actors are grouped into collections called meshes and messages can be broadcast to all members. -2. Fault tolerance through supervision trees: Actors and processes form a tree and failures propagate up the tree, providing good default error behavior and enabling fine-grained fault recovery. -3. Point-to-point RDMA transfers: cheap registration of any GPU or CPU memory in a process, with the one-sided transfers based on libibverbs -4. Distributed tensors: actors can work with tensor objects sharded across processes - -Monarch code imperatively describes how to create processes and actors using a simple python API: +1. Remote actors with scalable messaging: Actors are grouped into collections + called meshes and messages can be broadcast to all members. +2. Fault tolerance through supervision trees: Actors and processes form a tree + and failures propagate up the tree, providing good default error behavior and + enabling fine-grained fault recovery. +3. Point-to-point RDMA transfers: cheap registration of any GPU or CPU memory in + a process, with the one-sided transfers based on libibverbs +4. Distributed tensors: actors can work with tensor objects sharded across + processes + +Monarch code imperatively describes how to create processes and actors using a +simple python API: ```python from monarch.actor import Actor, endpoint, this_host @@ -33,8 +39,9 @@ fut = trainers.train.call(step=0) fut.get() ``` - -The [introduction to monarch concepts](https://meta-pytorch.org/monarch/generated/examples/getting_started.html) provides an introduction to using these features. +The +[introduction to monarch concepts](https://meta-pytorch.org/monarch/generated/examples/getting_started.html) +provides an introduction to using these features. > ⚠️ **Early Development Warning** Monarch is currently in an experimental > stage. 
You should expect bugs, incomplete features, and APIs that may change @@ -45,16 +52,21 @@ The [introduction to monarch concepts](https://meta-pytorch.org/monarch/generate ## 📖 Documentation -View Monarch's hosted documentation [at this link](https://meta-pytorch.org/monarch/). +View Monarch's hosted documentation +[at this link](https://meta-pytorch.org/monarch/). ## Installation -Note for running distributed tensors and RDMA, the local torch version must match the version that monarch was built with. -Stable and nightly distributions require libmxl and libibverbs (runtime). + +Note for running distributed tensors and RDMA, the local torch version must +match the version that monarch was built with. Stable and nightly distributions +require libmlx5 and libibverbs (runtime). ## Fedora + `sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel` ## Ubuntu + `sudo apt install -y rdma-core libibverbs1 libmlx5-1 libibverbs-dev` ### Stable @@ -64,14 +76,15 @@ Stable and nightly distributions require libmxl and libibverbs (runtime). torchmonarch stable is built with the latest stable torch. ### Nightly + `pip install torchmonarch-nightly` torchmonarch-nightly is built with torch nightly. ### Build and Install from Source -If you're building Monarch from source, you should be building it with the nightly PyTorch as well for ABI compatibility. - +If you're building Monarch from source, you should be building it with the +nightly PyTorch as well for ABI compatibility. #### On Fedora distributions @@ -161,10 +174,11 @@ pip list | grep monarch #### On non-CUDA machines -You can also build Monarch to run on non-CUDA machines, e.g. locally on a MacOS system. - -Note that this does not support tensor engine, which is tied to CUDA and RDMA (via ibverbs). +You can also build Monarch to run on non-CUDA machines, e.g. locally on a MacOS +system. +Note that this does not support tensor engine, which is tied to CUDA and RDMA +(via ibverbs). 
```sh @@ -180,8 +194,6 @@ rustup default nightly # Install build dependencies pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu pip install -r build-requirements.txt -# Install test dependencies -pip install -r python/tests/requirements.txt # Build and install Monarch USE_TENSOR_ENGINE=0 pip install --no-build-isolation . @@ -192,10 +204,10 @@ USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e . pip list | grep monarch ``` - ## Running examples -Check out the `examples/` directory for demonstrations of how to use Monarch's APIs. +Check out the `examples/` directory for demonstrations of how to use Monarch's +APIs. We'll be adding more examples as we stabilize and polish functionality! @@ -205,6 +217,7 @@ We have both Rust and Python unit tests. Rust tests are run with `cargo-nextest` and Python tests are run with `pytest`. Rust tests: + ```sh # We use cargo-nextest to run our tests, as they can provide strong process isolation # between every test. @@ -213,12 +226,14 @@ Rust tests: cargo install cargo-nextest --locked cargo nextest run ``` + cargo-nextest supports all of the filtering flags of "cargo test". Python tests: + ```sh -# Make sure to install test dependencies first -pip install -r python/tests/requirements.txt +# Install test dependencies if not already installed +pip install -e '.[test]' # Run unit tests. 
consider -s for more verbose output pytest python/tests/ -v -m "not oss_skip" ``` diff --git a/build-requirements.txt b/build-requirements.txt index 11fa6deea..289440d66 100644 --- a/build-requirements.txt +++ b/build-requirements.txt @@ -1,4 +1,5 @@ -setuptools +setuptools>=64 setuptools-rust wheel numpy +build diff --git a/pyproject.toml b/pyproject.toml index 8493c4820..6f3cc5f05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,61 @@ -[tool.pytest.ini_options] +# Build system configuration - PEP 517/518 +# Note: When using --no-build-isolation (required for wheel builds due to torch +# detection), these deps must be manually installed via build-requirements.txt. +# However, [build-system.requires] is still needed for standard editable installs +# like `pip install -e '.[test]'` which don't use --no-build-isolation. +[build-system] +requires = ["setuptools>=64", "setuptools-rust", "wheel", "numpy"] +build-backend = "setuptools.build_meta" + +[project] +name = "monarch" +version = "0.0.1" +description = "Monarch: Single controller library" +readme = "README.md" +requires-python = ">=3.10" +license = {text = "BSD-3-Clause"} +authors = [ + {name = "Meta", email = "oncall+monarch@xmail.facebook.com"} +] + +dependencies = [ + "pyzmq", + "requests", + "numpy", + "pyre-extensions", + "typing-extensions>=4.12", + "cloudpickle", + "torchx-nightly", + "lark", + "tabulate", + "opentelemetry-api", + "clusterscope", +] + +[project.optional-dependencies] +examples = [ + "bs4", + "ipython", +] +test = [ + "pytest", + "pytest-timeout", + "pytest-asyncio", + "pytest-xdist", + "pyright", +] + +[project.scripts] +monarch = "monarch.tools.cli:main" +monarch_bootstrap = "monarch._src.actor.bootstrap_main:invoke_main" +[tool.setuptools] +packages = {find = {where = ["python"], exclude = ["tests*", "tests.*"]}} + +[tool.setuptools.package-dir] +"" = "python" + +[tool.pytest.ini_options] markers = [ "oss_skip: marks tests to skip in OSS CI", ] diff --git 
a/python/tests/requirements.txt b/python/tests/requirements.txt deleted file mode 100644 index a560cd3de..000000000 --- a/python/tests/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -pytest -pytest-timeout -pytest-asyncio -pytest-xdist -pyright diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e07c89c8a..000000000 --- a/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -pyzmq -requests -numpy -pyre-extensions -typing-extensions>=4.12 -cloudpickle -torchx-nightly -lark -tabulate -opentelemetry-api -clusterscope diff --git a/scripts/common-setup.sh b/scripts/common-setup.sh index 272da3e53..a0bd74475 100755 --- a/scripts/common-setup.sh +++ b/scripts/common-setup.sh @@ -49,7 +49,7 @@ setup_rust_toolchain() { # Install Python test dependencies install_python_test_dependencies() { echo "Installing test dependencies..." - pip install -r python/tests/requirements.txt + pip install -e '.[test]' dnf install -y rsync # required for code sync tests } diff --git a/setup.py b/setup.py index 25d4e1e18..dfbfec841 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ import sys import sysconfig -from setuptools import Command, find_packages, setup +from setuptools import Command, setup from setuptools.command.build_ext import build_ext from setuptools.extension import Extension @@ -198,12 +198,6 @@ def run(self): subprocess.run(["cargo", "clean"]) -with open("requirements.txt") as f: - reqs = f.read() - -with open("README.md", encoding="utf8") as f: - readme = f.read() - if sys.platform.startswith("linux"): # Always include the active env's lib (Conda-safe) conda_lib = os.path.join(sys.prefix, "lib") @@ -278,35 +272,10 @@ def run(self): setup( name=package_name, version=package_version, - packages=find_packages( - where="python", - exclude=["python/tests.*", "python/tests"], - ), - package_dir={"": "python"}, - python_requires=">= 3.10", - install_requires=reqs.strip().split("\n"), - extras_require={ - "examples": [ - "bs4", - "ipython", - ], - }, - 
license="BSD-3-Clause", - author="Meta", - author_email="oncall+monarch@xmail.facebook.com", - description="Monarch: Single controller library", - long_description=readme, - long_description_content_type="text/markdown", ext_modules=[ controller_C, common_C, ], - entry_points={ - "console_scripts": [ - "monarch=monarch.tools.cli:main", - "monarch_bootstrap=monarch._src.actor.bootstrap_main:invoke_main", - ], - }, rust_extensions=rust_extensions, cmdclass={ "build_ext": build_ext,