From 63a2766fb8aab7219938408047a9731049f00311 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Dec 2025 11:21:00 -0800
Subject: [PATCH 1/7] Update

[ghstack-poisoned]
---
 Makefile                                  | 36 +++++++++++++++--------
 examples/models/whisper/CMakePresets.json | 34 +++++++++++++++++++++
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 13fc941e135..0dcfb7a5ee3 100644
--- a/Makefile
+++ b/Makefile
@@ -87,21 +87,22 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
-	@echo "This Makefile adds targets to build runners for various models on various backends. Run using `make <target>`. Available targets:"
-	@echo "  voxtral-cuda  - Build Voxtral runner with CUDA backend"
-	@echo "  voxtral-cpu   - Build Voxtral runner with CPU backend"
-	@echo "  voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
-	@echo "  whisper-cuda  - Build Whisper runner with CUDA backend"
-	@echo "  whisper-cpu   - Build Whisper runner with CPU backend"
-	@echo "  whisper-metal - Build Whisper runner with Metal backend (macOS only)"
-	@echo "  llama-cpu     - Build Llama runner with CPU backend"
-	@echo "  llava-cpu     - Build Llava runner with CPU backend"
-	@echo "  gemma3-cuda   - Build Gemma3 runner with CUDA backend"
-	@echo "  gemma3-cpu    - Build Gemma3 runner with CPU backend"
-	@echo "  clean         - Clean build artifacts"
+	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
+	@echo "  voxtral-cuda       - Build Voxtral runner with CUDA backend"
+	@echo "  voxtral-cpu        - Build Voxtral runner with CPU backend"
+	@echo "  voxtral-metal      - Build Voxtral runner with Metal backend (macOS only)"
+	@echo "  whisper-cuda       - Build Whisper runner with CUDA backend"
+	@echo "  whisper-cuda-debug - Build Whisper runner with CUDA backend (debug mode)"
+	@echo "  whisper-cpu        - Build Whisper runner with CPU backend"
+	@echo "  whisper-metal      - Build Whisper runner with Metal backend (macOS only)"
+	@echo "  llama-cpu          - Build Llama runner with CPU backend"
+	@echo "  llava-cpu          - Build Llava runner with CPU backend"
+	@echo "  gemma3-cuda        - Build Gemma3 runner with CUDA backend"
+	@echo "  gemma3-cpu         - Build Gemma3 runner with CPU backend"
+	@echo "  clean              - Clean build artifacts"
 
 voxtral-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
@@ -139,6 +140,15 @@ whisper-cuda:
 	@echo "✓ Build complete!"
 	@echo "   Binary: cmake-out/examples/models/whisper/whisper_runner"
 
+whisper-cuda-debug:
+	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
+	cmake --workflow --preset llm-debug-cuda
+	@echo "==> Building Whisper runner with CUDA (debug mode)..."
+	cd examples/models/whisper && cmake --workflow --preset whisper-cuda-debug
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "   Binary: cmake-out/examples/models/whisper/whisper_runner"
+
 whisper-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release
diff --git a/examples/models/whisper/CMakePresets.json b/examples/models/whisper/CMakePresets.json
index 1419788cb0c..a5bfc2be5a4 100644
--- a/examples/models/whisper/CMakePresets.json
+++ b/examples/models/whisper/CMakePresets.json
@@ -29,6 +29,20 @@
         "list": ["Linux", "Windows"]
       }
     },
+    {
+      "name": "whisper-cuda-debug",
+      "displayName": "Whisper runner (CUDA Debug)",
+      "inherits": ["whisper-base"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_CUDA": "ON",
+        "CMAKE_BUILD_TYPE": "Debug"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
     {
       "name": "whisper-metal",
       "displayName": "Whisper runner (Metal)",
@@ -56,6 +70,12 @@
       "configurePreset": "whisper-cuda",
       "targets": ["whisper_runner"]
     },
+    {
+      "name": "whisper-cuda-debug",
+      "displayName": "Build Whisper runner (CUDA Debug)",
+      "configurePreset": "whisper-cuda-debug",
+      "targets": ["whisper_runner"]
+    },
     {
       "name": "whisper-metal",
       "displayName": "Build Whisper runner (Metal)",
@@ -92,6 +112,20 @@
       }
     ]
   },
+  {
+    "name": "whisper-cuda-debug",
+    "displayName": "Configure and build Whisper runner (CUDA Debug)",
+    "steps": [
+      {
+        "type": "configure",
+        "name": "whisper-cuda-debug"
+      },
+      {
+        "type": "build",
+        "name": "whisper-cuda-debug"
+      }
+    ]
+  },
   {
     "name": "whisper-metal",
     "displayName": "Configure and build Whisper runner (Metal)",

From f02dbe10d2d2fcf5f52e79660fc0819087e47e4c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Dec 2025 11:21:04 -0800
Subject: [PATCH 2/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml                    |  26 ++++
 backends/cuda/runtime/shims/memory.cpp        |  30 +++-
 .../cuda/runtime/shims/tests/CMakeLists.txt   |  60 ++++++++
 .../runtime/shims/tests/CMakePresets.json     |  88 ++++++++++++
 backends/cuda/runtime/shims/tests/README.md   | 132 ++++++++++++++++++
 ..._aoti_torch_create_tensor_from_blob_v2.cpp |  65 +++++++++
 6 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 backends/cuda/runtime/shims/tests/CMakeLists.txt
 create mode 100644 backends/cuda/runtime/shims/tests/CMakePresets.json
 create mode 100644 backends/cuda/runtime/shims/tests/README.md

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 265bc252ee8..904efb4e52f 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -87,6 +87,32 @@ jobs:
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
+  test-cuda-shims:
+    name: test-cuda-shims
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # Build ExecuTorch with CUDA support
+        ./install_executorch.sh
+
+        # Build and run CUDA shim tests
+        pushd backends/cuda/runtime/shims/tests
+        cmake --workflow --preset default
+        ctest --preset default --output-on-failure
+        popd
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index aaaf3913381..b6310b0ee5a 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -119,8 +119,6 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int32_t layout,
     const uint8_t* opaque_metadata,
     int64_t opaque_metadata_size) {
-  // TODO(gasoonjia): verify given data is on the target device
-  (void)device_type;
   (void)opaque_metadata;
   (void)layout;
   (void)opaque_metadata_size;
@@ -154,6 +152,34 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
   // Storage offset must be 0 since from_blob cannot handle different offsets
   ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset));
 
+  // Verify that data pointer location matches the requested device_type
+  cudaPointerAttributes data_attributes{};
+  ET_CUDA_CHECK_OR_RETURN_ERROR(
+      cudaPointerGetAttributes(&data_attributes, data));
+
+  bool data_is_on_device = data_attributes.type == cudaMemoryTypeDevice;
+  bool data_is_on_host = data_attributes.type == cudaMemoryTypeHost ||
+      data_attributes.type == cudaMemoryTypeUnregistered;
+  bool requested_device =
+      device_type == static_cast<int32_t>(SupportedDevices::CUDA);
+  bool requested_cpu =
+      device_type == static_cast<int32_t>(SupportedDevices::CPU);
+
+  // Error if data location doesn't match requested device type
+  ET_CHECK_OR_RETURN_ERROR(
+      !(data_is_on_device && requested_cpu),
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CUDA "
+      "but device_type is CPU. Data must be on CPU for CPU tensors.",
+      data);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      !(data_is_on_host && requested_device),
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CPU "
+      "but device_type is CUDA. Data must be on GPU for CUDA tensors.",
+      data);
+
   // Convert sizes to the format expected by ExecuTorch using SizesType
   std::vector<SizesType> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
 
diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
new file mode 100644
index 00000000000..f626e067627
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+project(aoti_cuda_shim_tests LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Find required packages
+find_package(CUDAToolkit REQUIRED)
+find_package(GTest REQUIRED)
+
+# Get EXECUTORCH_ROOT
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+endif()
+
+# Find installed ExecuTorch
+find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX})
+
+# List of test files
+set(CUDA_SHIM_TESTS
+    test_aoti_torch_create_tensor_from_blob_v2
+    test_aoti_torch_empty_strided
+    test_aoti_torch_delete_tensor_object
+    test_aoti_torch__reinterpret_tensor
+    test_aoti_torch_copy_
+    test_aoti_torch_new_tensor_handle
+)
+
+enable_testing()
+
+foreach(test_name ${CUDA_SHIM_TESTS})
+  add_executable(${test_name} ${test_name}.cpp)
+
+  target_include_directories(
+    ${test_name}
+    PRIVATE ${EXECUTORCH_ROOT}
+            ${CUDAToolkit_INCLUDE_DIRS}
+  )
+
+  target_link_libraries(
+    ${test_name}
+    PRIVATE GTest::gtest
+            GTest::gtest_main
+            aoti_cuda_shims
+            aoti_cuda_backend
+            cuda_tensor_maker
+            cuda_platform
+            executorch_core
+            extension_tensor
+            CUDA::cudart
+  )
+
+  add_test(NAME ${test_name} COMMAND ${test_name})
+endforeach()
diff --git a/backends/cuda/runtime/shims/tests/CMakePresets.json b/backends/cuda/runtime/shims/tests/CMakePresets.json
new file mode 100644
index 00000000000..f4e4ee7253e
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/CMakePresets.json
@@ -0,0 +1,88 @@
+{
+  "version": 6,
+  "configurePresets": [
+    {
+      "name": "default",
+      "displayName": "CUDA Shim Tests",
+      "binaryDir": "${sourceDir}/../../../../../cmake-out/backends/cuda/runtime/shims/tests",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_PREFIX_PATH": "${sourceDir}/../../../../../cmake-out"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
+    {
+      "name": "debug",
+      "displayName": "CUDA Shim Tests (Debug)",
+      "inherits": ["default"],
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "default",
+      "displayName": "Build CUDA Shim Tests",
+      "configurePreset": "default"
+    },
+    {
+      "name": "debug",
+      "displayName": "Build CUDA Shim Tests (Debug)",
+      "configurePreset": "debug"
+    }
+  ],
+  "workflowPresets": [
+    {
+      "name": "default",
+      "displayName": "Configure and build CUDA Shim Tests",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "default"
+        },
+        {
+          "type": "build",
+          "name": "default"
+        }
+      ]
+    },
+    {
+      "name": "debug",
+      "displayName": "Configure and build CUDA Shim Tests (Debug)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "debug"
+        },
+        {
+          "type": "build",
+          "name": "debug"
+        }
+      ]
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "default",
+      "displayName": "Run all CUDA Shim Tests",
+      "configurePreset": "default",
+      "output": {
+        "outputOnFailure": true
+      }
+    },
+    {
+      "name": "debug",
+      "displayName": "Run all CUDA Shim Tests (Debug)",
+      "configurePreset": "debug",
+      "output": {
+        "outputOnFailure": true
+      }
+    }
+  ]
+}
+
diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
new file mode 100644
index 00000000000..2c4b91570b0
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -0,0 +1,132 @@
+# CUDA AOTI Shim Tests
+
+Unit tests for the CUDA AOTI (Ahead-Of-Time Inductor) shim functions used by the ExecuTorch CUDA backend.
+
+## Prerequisites
+
+1. **CUDA Toolkit**: Ensure CUDA is installed and available
+2. **ExecuTorch with CUDA**: Build and install ExecuTorch with CUDA support first
+
+## Building ExecuTorch with CUDA
+
+From the ExecuTorch root directory:
+
+```bash
+# Release build
+cmake --workflow --preset llm-release-cuda
+
+# Or debug build (recommended for debugging test failures)
+cmake --workflow --preset llm-debug-cuda
+```
+
+## Building the Tests
+
+### Option 1: Using CMake Presets (Recommended)
+
+From this directory (`backends/cuda/runtime/shims/tests/`):
+
+```bash
+# Release build
+cmake --workflow --preset default
+
+# Debug build
+cmake --workflow --preset debug
+```
+
+### Option 2: Manual CMake Commands
+
+From the ExecuTorch root directory:
+
+```bash
+# Configure
+cmake -B cmake-out/backends/cuda/runtime/shims/tests \
+  -S backends/cuda/runtime/shims/tests \
+  -DCMAKE_PREFIX_PATH=$(pwd)/cmake-out \
+  -DCMAKE_BUILD_TYPE=Debug
+
+# Build
+cmake --build cmake-out/backends/cuda/runtime/shims/tests -j$(nproc)
+```
+
+## Running the Tests
+
+### Run All Tests
+
+```bash
+# Using ctest (from the build directory)
+cd cmake-out/backends/cuda/runtime/shims/tests
+ctest --output-on-failure
+
+# Or using the test preset (from this directory)
+ctest --preset default
+```
+
+### Run a Specific Test
+
+```bash
+# From the build directory
+./test_aoti_torch_create_tensor_from_blob_v2
+./test_aoti_torch_empty_strided
+./test_aoti_torch_delete_tensor_object
+./test_aoti_torch_copy_
+./test_aoti_torch_new_tensor_handle
+./test_aoti_torch_item_bool
+./test_aoti_torch_assign_tensors_out
+```
+
+### Run Specific Test Cases
+
+Use Google Test filters to run specific test cases:
+
+```bash
+# Run only device mismatch tests
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="*DeviceMismatch*"
+
+# Run a single test
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="AOTITorchCreateTensorFromBlobV2Test.BasicFunctionalityCUDA"
+
+# List all available tests
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_list_tests
+```
+
+## Test Descriptions
+
+| Test File | Description |
+|-----------|-------------|
+| `test_aoti_torch_create_tensor_from_blob_v2` | Tests tensor creation from existing memory blobs, including device type validation |
+| `test_aoti_torch_empty_strided` | Tests creation of uninitialized tensors with specified strides |
+| `test_aoti_torch_delete_tensor_object` | Tests proper tensor deletion and memory management |
+| `test_aoti_torch__reinterpret_tensor` | Tests tensor view reinterpretation with different shapes/strides |
+| `test_aoti_torch_copy_` | Tests tensor copy operations between CPU and CUDA |
+| `test_aoti_torch_new_tensor_handle` | Tests creating new tensor handles that share memory |
+| `test_aoti_torch_item_bool` | Tests extracting boolean values from scalar tensors |
+| `test_aoti_torch_assign_tensors_out` | Tests creating tensor views that share underlying data |
+
+## Troubleshooting
+
+### CUDA Not Available
+
+If tests are skipped with "CUDA not available", ensure:
+- CUDA drivers are installed
+- A CUDA-capable GPU is present
+- `nvidia-smi` shows the GPU
+
+### Link Errors
+
+If you get link errors, ensure ExecuTorch was built with CUDA support:
+```bash
+cmake --workflow --preset llm-release-cuda
+```
+
+### Test Failures
+
+For debugging test failures, build with debug mode:
+```bash
+cmake --workflow --preset debug
+```
+
+Then run with verbose output:
+```bash
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_break_on_failure
+```
+
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index d9b785a5a78..db0ab84970d 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -752,3 +752,68 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) {
     EXPECT_EQ(error, Error::Ok);
   }
 }
+
+// Test device type mismatch: CPU data with CUDA device request should fail
+TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCPUDataCUDADevice) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculate_contiguous_strides(sizes);
+
+  // Allocate CPU memory
+  size_t bytes = calculate_numel(sizes) * sizeof(float);
+  void* cpu_data = allocate_cpu_memory(bytes);
+  ASSERT_NE(cpu_data, nullptr);
+
+  Tensor* tensor;
+  // Request CUDA device but provide CPU memory - should fail
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      cpu_data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA), // Request CUDA
+      0, // device index
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::InvalidArgument)
+      << "Should fail when CPU data is provided but CUDA device is requested";
+}
+
+// Test device type mismatch: CUDA data with CPU device request should fail
+TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCUDADataCPUDevice) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculate_contiguous_strides(sizes);
+
+  // Allocate CUDA memory (device memory, not managed)
+  size_t bytes = calculate_numel(sizes) * sizeof(float);
+  void* cuda_data = nullptr;
+  cudaError_t cuda_err = cudaMalloc(&cuda_data, bytes);
+  ASSERT_EQ(cuda_err, cudaSuccess);
+  ASSERT_NE(cuda_data, nullptr);
+
+  Tensor* tensor;
+  // Request CPU device but provide CUDA memory - should fail
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      cuda_data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CPU), // Request CPU
+      0, // device index
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::InvalidArgument)
+      << "Should fail when CUDA data is provided but CPU device is requested";
+
+  // Clean up the CUDA memory we allocated directly
+  cudaFree(cuda_data);
+}

From a97933b8acc62a5f8fa6ca837a1b4675228c7806 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 08:00:16 +0000
Subject: [PATCH 3/7] Update

[ghstack-poisoned]
---
 .../cuda/runtime/shims/tests/CMakeLists.txt   | 20 ++++---
 .../runtime/shims/tests/CMakePresets.json     |  1 -
 backends/cuda/runtime/shims/tests/README.md   | 54 +++----------------
 3 files changed, 17 insertions(+), 58 deletions(-)

diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
index 9c367ab8e1a..a7df6075c37 100644
--- a/backends/cuda/runtime/shims/tests/CMakeLists.txt
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -21,7 +21,10 @@ FetchContent_Declare(
   GIT_TAG v1.14.0
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+set(gtest_force_shared_crt
+    ON
+    CACHE BOOL "" FORCE
+)
 FetchContent_MakeAvailable(googletest)
 
 # Get EXECUTORCH_ROOT
@@ -34,12 +37,9 @@ find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX})
 
 # List of test files
 set(CUDA_SHIM_TESTS
-    test_aoti_torch_create_tensor_from_blob_v2
-    test_aoti_torch_empty_strided
-    test_aoti_torch_delete_tensor_object
-    test_aoti_torch__reinterpret_tensor
-    test_aoti_torch_copy_
-    test_aoti_torch_new_tensor_handle
+    test_aoti_torch_create_tensor_from_blob_v2 test_aoti_torch_empty_strided
+    test_aoti_torch_delete_tensor_object test_aoti_torch__reinterpret_tensor
+    test_aoti_torch_copy_ test_aoti_torch_new_tensor_handle
 )
 
 enable_testing()
@@ -48,10 +48,8 @@ foreach(test_name ${CUDA_SHIM_TESTS})
   add_executable(${test_name} ${test_name}.cpp)
 
   target_include_directories(
-    ${test_name}
-    PRIVATE ${EXECUTORCH_ROOT}/..
-            ${EXECUTORCH_ROOT}
-            ${CUDAToolkit_INCLUDE_DIRS}
+    ${test_name} PRIVATE ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}
+                 ${CUDAToolkit_INCLUDE_DIRS}
   )
 
   target_link_libraries(
diff --git a/backends/cuda/runtime/shims/tests/CMakePresets.json b/backends/cuda/runtime/shims/tests/CMakePresets.json
index 40e0dbbd698..33b448f1538 100644
--- a/backends/cuda/runtime/shims/tests/CMakePresets.json
+++ b/backends/cuda/runtime/shims/tests/CMakePresets.json
@@ -93,4 +93,3 @@
     }
   ]
 }
-
diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
index 2c4b91570b0..1456447eeba 100644
--- a/backends/cuda/runtime/shims/tests/README.md
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -13,13 +13,13 @@ From the ExecuTorch root directory:
 
 ```bash
 # Release build
-cmake --workflow --preset llm-release-cuda
+cmake --workflow llm-release-cuda
 
 # Or debug build (recommended for debugging test failures)
-cmake --workflow --preset llm-debug-cuda
+cmake --workflow llm-debug-cuda
 ```
 
-## Building the Tests
+## Building and Running the Tests
 
 ### Option 1: Using CMake Presets (Recommended)
 
@@ -27,10 +27,10 @@ From this directory (`backends/cuda/runtime/shims/tests/`):
 
 ```bash
 # Release build
-cmake --workflow --preset default
+cmake --workflow default
 
 # Debug build
-cmake --workflow --preset debug
+cmake --workflow debug
 ```
 
 ### Option 2: Manual CMake Commands
@@ -48,37 +48,13 @@ cmake -B cmake-out/backends/cuda/runtime/shims/tests \
 cmake --build cmake-out/backends/cuda/runtime/shims/tests -j$(nproc)
 ```
 
-## Running the Tests
-
-### Run All Tests
-
-```bash
-# Using ctest (from the build directory)
-cd cmake-out/backends/cuda/runtime/shims/tests
-ctest --output-on-failure
-
-# Or using the test preset (from this directory)
-ctest --preset default
-```
-
-### Run a Specific Test
-
-```bash
-# From the build directory
-./test_aoti_torch_create_tensor_from_blob_v2
-./test_aoti_torch_empty_strided
-./test_aoti_torch_delete_tensor_object
-./test_aoti_torch_copy_
-./test_aoti_torch_new_tensor_handle
-./test_aoti_torch_item_bool
-./test_aoti_torch_assign_tensors_out
-```
-
 ### Run Specific Test Cases
 
 Use Google Test filters to run specific test cases:
 
 ```bash
+# From the build directory
+cd cmake-out/backends/cuda/runtime/shims/tests
 # Run only device mismatch tests
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="*DeviceMismatch*"
 
@@ -89,19 +65,6 @@ Use Google Test filters to run specific test cases:
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_list_tests
 ```
 
-## Test Descriptions
-
-| Test File | Description |
-|-----------|-------------|
-| `test_aoti_torch_create_tensor_from_blob_v2` | Tests tensor creation from existing memory blobs, including device type validation |
-| `test_aoti_torch_empty_strided` | Tests creation of uninitialized tensors with specified strides |
-| `test_aoti_torch_delete_tensor_object` | Tests proper tensor deletion and memory management |
-| `test_aoti_torch__reinterpret_tensor` | Tests tensor view reinterpretation with different shapes/strides |
-| `test_aoti_torch_copy_` | Tests tensor copy operations between CPU and CUDA |
-| `test_aoti_torch_new_tensor_handle` | Tests creating new tensor handles that share memory |
-| `test_aoti_torch_item_bool` | Tests extracting boolean values from scalar tensors |
-| `test_aoti_torch_assign_tensors_out` | Tests creating tensor views that share underlying data |
-
 ## Troubleshooting
 
 ### CUDA Not Available
@@ -122,11 +85,10 @@ cmake --workflow --preset llm-release-cuda
 
 For debugging test failures, build with debug mode:
 ```bash
-cmake --workflow --preset debug
+cmake --workflow debug
 ```
 
 Then run with verbose output:
 ```bash
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_break_on_failure
 ```
-

From e1bb6c241cc008b96e63689b43c194521022fe80 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 17:40:55 +0000
Subject: [PATCH 4/7] Update

[ghstack-poisoned]
---
 backends/cuda/runtime/shims/tests/README.md | 10 +--
 extension/llm/custom_ops/custom_ops.py      | 86 +++++++++++++++++++++
 2 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
index 1456447eeba..3e95280811b 100644
--- a/backends/cuda/runtime/shims/tests/README.md
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -13,10 +13,10 @@ From the ExecuTorch root directory:
 
 ```bash
 # Release build
-cmake --workflow llm-release-cuda
+cmake --workflow --preset llm-release-cuda
 
 # Or debug build (recommended for debugging test failures)
-cmake --workflow llm-debug-cuda
+cmake --workflow --preset llm-debug-cuda
 ```
 
 ## Building and Running the Tests
@@ -27,10 +27,10 @@ From this directory (`backends/cuda/runtime/shims/tests/`):
 
 ```bash
 # Release build
-cmake --workflow default
+cmake --workflow --preset default
 
 # Debug build
-cmake --workflow debug
+cmake --workflow --preset debug
 ```
 
 ### Option 2: Manual CMake Commands
@@ -85,7 +85,7 @@ cmake --workflow --preset llm-release-cuda
 
 For debugging test failures, build with debug mode:
 ```bash
-cmake --workflow debug
+cmake --workflow --preset debug
 ```
 
 Then run with verbose output:
diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
index dfa357fe356..51c9601454a 100644
--- a/extension/llm/custom_ops/custom_ops.py
+++ b/extension/llm/custom_ops/custom_ops.py
@@ -16,6 +16,12 @@
 
 from torch.library import impl
 
+from typing import Tuple
+
+from torch._inductor.lowering import lowerings as L, register_lowering
+
+aten = torch.ops.aten
+
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
@@ -387,3 +393,83 @@ def custom_quantized_sdpa_meta(
     )
 
     return torch.empty(query.size(), dtype=torch.float32, device="meta")
+
+# 1) Define the custom op in the "executorch" namespace with name "alias"
+@torch.library.custom_op("executorch::alias", mutates_args=())
+def custom_alias(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    # no copies, just pass-through
+    return x, y
+
+
+# 2) FakeTensor kernel: describes output metadata for compile-time
+@custom_alias.register_fake
+def _(x, y):
+    # For this op, outputs have exactly the same shape/dtype/device as inputs.
+    # We just need *dummy* tensors with that metadata.
+    out_x = torch.empty_like(x)
+    out_y = torch.empty_like(y)
+    return out_x, out_y
+
+
+@register_lowering(torch.ops.executorch.alias.default)
+def lowering_custom_alias(x, y):
+    # x, y here are IR values (Inductor's internal representation).
+    # Alias is logically a no-op – just pass them through.
+    return x, y
+
+
+# Expecting cache shape: (B, H, S_max, D), value shape (B, H, S, D) where S <= S_max
+def _validate_cross_attn_cache_params(value: torch.Tensor, cache: torch.Tensor):
+    torch._assert(value.dim() == 4, "value must be 4D")
+    torch._assert(cache.dim() == 4, "cache must be 4D")
+    # Cache shape: (B, H, S_max, D)
+    # Value shape: (B, H, S, D)
+    torch._assert(
+        value.size(2) <= cache.size(2),
+        f"value sequence length {value.size(2)} exceeds cache size {cache.size(2)}",
+    )
+    torch._assert(value.size(0) == cache.size(0), "batch size mismatch")
+    torch._assert(value.size(1) == cache.size(1), "num heads mismatch")
+    torch._assert(value.size(3) == cache.size(3), "head dim mismatch")
+    torch._assert(value.dtype == cache.dtype, "dtype mismatch")
+
+
+# This is cheating: we deliberately do NOT mark `cache` as mutating so that this
+# custom op can be used in HOPs such as `torch.cond`, where `torch.compile` requires
+# no aliasing or mutation in the branches. This is fine because we only care about inference.
+@torch.library.custom_op("executorch::update_cross_attn_cache", mutates_args=[])
+def _update_cross_attn_cache(value: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
+    # Eager implementation
+    _validate_cross_attn_cache_params(value, cache)
+
+    # Slice the cache to match value's sequence length and copy
+    # cache shape: [B, H, S_max, D]
+    # value shape: [B, H, S, D]
+    cache[:, :, : value.size(2), :].copy_(value)
+    return cache
+
+
+# Register the fake (meta) kernel
+@_update_cross_attn_cache.register_fake
+def _update_cross_attn_cache_fake(
+    value: torch.Tensor, cache: torch.Tensor
+) -> torch.Tensor:
+    _validate_cross_attn_cache_params(value, cache)
+    return torch.empty_like(cache)
+
+
+# Register Inductor lowering
+@register_lowering(torch.ops.executorch.update_cross_attn_cache)
+def _update_cross_attn_cache_lowering(value, cache):
+    # cache shape: [B, H, S_max, D]
+    # value shape: [B, H, S, D]
+
+    # We need to slice the cache along dim 2 (sequence length)
+    # slice(self, dim, start, end, step=1)
+    seq_len = value.get_size()[2]
+    cache_slice = L[aten.slice.Tensor](cache, 2, 0, seq_len, 1)
+
+    # Copy value into the slice
+    L[aten.copy_.default](cache_slice, value)
+
+    return cache
\ No newline at end of file

From 2a7a9f0d96c974adfcf9dd552ec88b92540736f1 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 17:43:43 +0000
Subject: [PATCH 5/7] Update

[ghstack-poisoned]
---
 extension/llm/custom_ops/custom_ops.py | 86 --------------------------
 1 file changed, 86 deletions(-)

diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
index 51c9601454a..dfa357fe356 100644
--- a/extension/llm/custom_ops/custom_ops.py
+++ b/extension/llm/custom_ops/custom_ops.py
@@ -16,12 +16,6 @@
 
 from torch.library import impl
 
-from typing import Tuple
-
-from torch._inductor.lowering import lowerings as L, register_lowering
-
-aten = torch.ops.aten
-
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
@@ -393,83 +387,3 @@ def custom_quantized_sdpa_meta(
     )
 
     return torch.empty(query.size(), dtype=torch.float32, device="meta")
-
-# 1) Define the custom op in the "executorch" namespace with name "alias"
-@torch.library.custom_op("executorch::alias", mutates_args=())
-def custom_alias(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    # no copies, just pass-through
-    return x, y
-
-
-# 2) FakeTensor kernel: describes output metadata for compile-time
-@custom_alias.register_fake
-def _(x, y):
-    # For this op, outputs have exactly the same shape/dtype/device as inputs.
-    # We just need *dummy* tensors with that metadata.
-    out_x = torch.empty_like(x)
-    out_y = torch.empty_like(y)
-    return out_x, out_y
-
-
-@register_lowering(torch.ops.executorch.alias.default)
-def lowering_custom_alias(x, y):
-    # x, y here are IR values (Inductor's internal representation).
-    # Alias is logically a no-op – just pass them through.
-    return x, y
-
-
-# Expecting cache shape: (B, H, S_max, D), value shape (B, H, S, D) where S <= S_max
-def _validate_cross_attn_cache_params(value: torch.Tensor, cache: torch.Tensor):
-    torch._assert(value.dim() == 4, "value must be 4D")
-    torch._assert(cache.dim() == 4, "cache must be 4D")
-    # Cache shape: (B, H, S_max, D)
-    # Value shape: (B, H, S, D)
-    torch._assert(
-        value.size(2) <= cache.size(2),
-        f"value sequence length {value.size(2)} exceeds cache size {cache.size(2)}",
-    )
-    torch._assert(value.size(0) == cache.size(0), "batch size mismatch")
-    torch._assert(value.size(1) == cache.size(1), "num heads mismatch")
-    torch._assert(value.size(3) == cache.size(3), "head dim mismatch")
-    torch._assert(value.dtype == cache.dtype, "dtype mismatch")
-
-
-# This is cheating: we deliberately do NOT mark `cache` as mutating so that this
-# custom op can be used in HOPs such as `torch.cond`, where `torch.compile` requires
-# no aliasing or mutation in the branches. This is fine because we only care about inference.
-@torch.library.custom_op("executorch::update_cross_attn_cache", mutates_args=[])
-def _update_cross_attn_cache(value: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
-    # Eager implementation
-    _validate_cross_attn_cache_params(value, cache)
-
-    # Slice the cache to match value's sequence length and copy
-    # cache shape: [B, H, S_max, D]
-    # value shape: [B, H, S, D]
-    cache[:, :, : value.size(2), :].copy_(value)
-    return cache
-
-
-# Register the fake (meta) kernel
-@_update_cross_attn_cache.register_fake
-def _update_cross_attn_cache_fake(
-    value: torch.Tensor, cache: torch.Tensor
-) -> torch.Tensor:
-    _validate_cross_attn_cache_params(value, cache)
-    return torch.empty_like(cache)
-
-
-# Register Inductor lowering
-@register_lowering(torch.ops.executorch.update_cross_attn_cache)
-def _update_cross_attn_cache_lowering(value, cache):
-    # cache shape: [B, H, S_max, D]
-    # value shape: [B, H, S, D]
-
-    # We need to slice the cache along dim 2 (sequence length)
-    # slice(self, dim, start, end, step=1)
-    seq_len = value.get_size()[2]
-    cache_slice = L[aten.slice.Tensor](cache, 2, 0, seq_len, 1)
-
-    # Copy value into the slice
-    L[aten.copy_.default](cache_slice, value)
-
-    return cache
\ No newline at end of file

From 8b9408775df14d8adee82a8a0f9e8fbd849dee52 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 19:54:40 +0000
Subject: [PATCH 6/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml                                    | 4 ++--
 backends/cuda/runtime/shims/memory.cpp                        | 8 +++-----
 .../shims/tests/test_aoti_torch__reinterpret_tensor.cpp       | 6 ------
 .../tests/test_aoti_torch_create_tensor_from_blob_v2.cpp      | 6 ------
 4 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 0dbee174bd3..d12ab2df73e 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -105,11 +105,11 @@ jobs:
         set -eux
 
         # Build ExecuTorch with CUDA support
-        cmake --workflow llm-release-cuda
+        cmake --workflow --preset llm-release-cuda
 
         # Build and run CUDA shim tests
         pushd backends/cuda/runtime/shims/tests
-        cmake --workflow default
+        cmake --workflow --preset default
         popd
 
   export-model-cuda-artifact:
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 7599bd19b05..ecb1ded2f39 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -331,12 +331,10 @@ void clear_all_tensors() {
   // tensors set should now be empty, but ensure it's cleared
   tensors.clear();
 
-  ET_LOG(Info, "Cleared all tensors");
-}
-
-void clear_memory_tracking() {
+  // Clear memory tracking map (includes leftover NOT_OWN entries)
   memory_to_n_tensor.clear();
-  ET_LOG(Info, "Cleared memory tracking map");
+
+  ET_LOG(Info, "Cleared all tensors and memory tracking");
 }
 
 AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
index 2e646fb9a66..d3044810b15 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -42,9 +42,6 @@ class AOTITorchReinterpretTensorTest : public ::testing::Test {
 
     // Clear any remaining tensors from previous tests
     clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   void TearDown() override {
@@ -53,9 +50,6 @@ class AOTITorchReinterpretTensorTest : public ::testing::Test {
 
     // Clear the global tensor storage using the provided function
    clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   // Helper to calculate number of elements from sizes
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index ce9c2bb633d..db0ab84970d 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -40,9 +40,6 @@ class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test {
 
     // Clear any remaining tensors from previous tests
     clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   void TearDown() override {
@@ -52,9 +49,6 @@ class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test {
 
     // Clear the global tensor storage using the provided function
     clear_all_tensors();
 
-    // Clear memory tracking map
-    clear_memory_tracking();
-
     // Clean up any allocated memory buffers
     for (void* ptr : cuda_memory_buffers_) {
       if (ptr) {

From 73efe126558e91f2c784847d1f7318a74b235b0a Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 23:36:29 +0000
Subject: [PATCH 7/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index d12ab2df73e..dbce874cfc0 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -103,6 +103,8 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
+        # Install requirements
+        bash ./install_requirements.sh
 
         # Build ExecuTorch with CUDA support
         cmake --workflow --preset llm-release-cuda
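
Editor's note: the core of PATCH 2/7 is the device-type validation in `aoti_torch_create_tensor_from_blob_v2`, which classifies the caller's pointer with `cudaPointerGetAttributes` before wrapping it in a tensor. Below is a minimal standalone sketch of that classification, assuming only the CUDA runtime API; the helper `pointer_matches_device` and the `main` driver are illustrative names, not part of the patch, and the shim's `ET_*` error macros are replaced with plain return values. Build with, e.g., `nvcc pointer_check.cu -o pointer_check`.

```cpp
// Sketch of the host/device pointer check added in PATCH 2/7, reduced to
// plain CUDA runtime calls so it compiles on its own.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Returns true if `ptr` may back a tensor of the requested device type,
// mirroring the patch: registered and unregistered host memory both count
// as "host", cudaMemoryTypeDevice counts as "device". Managed memory is
// neither, so (like the patch's two checks) it passes for either request.
static bool pointer_matches_device(const void* ptr, bool want_cuda) {
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) {
    return false;  // treat a failed query as a mismatch
  }
  bool on_device = attr.type == cudaMemoryTypeDevice;
  bool on_host = attr.type == cudaMemoryTypeHost ||
      attr.type == cudaMemoryTypeUnregistered;
  // CUDA request rejects host data; CPU request rejects device data.
  return want_cuda ? !on_host : !on_device;
}

int main() {
  float* host_buf = static_cast<float*>(std::malloc(6 * sizeof(float)));
  void* dev_buf = nullptr;
  if (cudaMalloc(&dev_buf, 6 * sizeof(float)) != cudaSuccess) {
    std::puts("no CUDA device available; skipping");
    std::free(host_buf);
    return 0;
  }
  // Same two scenarios as the new DeviceMismatch unit tests:
  std::printf("host data, CUDA request: %s\n",
              pointer_matches_device(host_buf, true) ? "ok" : "rejected");
  std::printf("device data, CPU request: %s\n",
              pointer_matches_device(dev_buf, false) ? "ok" : "rejected");
  cudaFree(dev_buf);
  std::free(host_buf);
  return 0;
}
```

One detail this relies on: since CUDA 11, `cudaPointerGetAttributes` succeeds on a plain `malloc` pointer and reports `cudaMemoryTypeUnregistered` (older toolkits returned an error instead), which is why the patch treats both `cudaMemoryTypeHost` and `cudaMemoryTypeUnregistered` as host memory.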