From 63a2766fb8aab7219938408047a9731049f00311 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Dec 2025 11:21:00 -0800
Subject: [PATCH 1/7] Update

[ghstack-poisoned]
---
 Makefile                                  | 36 +++++++++++++++--------
 examples/models/whisper/CMakePresets.json | 34 +++++++++++++++++++++
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 13fc941e135..0dcfb7a5ee3 100644
--- a/Makefile
+++ b/Makefile
@@ -87,21 +87,22 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
-	@echo "This Makefile adds targets to build runners for various models on various backends. Run using `make <target>`. Available targets:"
-	@echo "  voxtral-cuda  - Build Voxtral runner with CUDA backend"
-	@echo "  voxtral-cpu   - Build Voxtral runner with CPU backend"
-	@echo "  voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
-	@echo "  whisper-cuda  - Build Whisper runner with CUDA backend"
-	@echo "  whisper-cpu   - Build Whisper runner with CPU backend"
-	@echo "  whisper-metal - Build Whisper runner with Metal backend (macOS only)"
-	@echo "  llama-cpu     - Build Llama runner with CPU backend"
-	@echo "  llava-cpu     - Build Llava runner with CPU backend"
-	@echo "  gemma3-cuda   - Build Gemma3 runner with CUDA backend"
-	@echo "  gemma3-cpu    - Build Gemma3 runner with CPU backend"
-	@echo "  clean         - Clean build artifacts"
+	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
+	@echo "  voxtral-cuda       - Build Voxtral runner with CUDA backend"
+	@echo "  voxtral-cpu        - Build Voxtral runner with CPU backend"
+	@echo "  voxtral-metal      - Build Voxtral runner with Metal backend (macOS only)"
+	@echo "  whisper-cuda       - Build Whisper runner with CUDA backend"
+	@echo "  whisper-cuda-debug - Build Whisper runner with CUDA backend (debug mode)"
+	@echo "  whisper-cpu        - Build Whisper runner with CPU backend"
+	@echo "  whisper-metal      - Build Whisper runner with Metal backend (macOS only)"
+	@echo "  llama-cpu          - Build Llama runner with CPU backend"
+	@echo "  llava-cpu          - Build Llava runner with CPU backend"
+	@echo "  gemma3-cuda        - Build Gemma3 runner with CUDA backend"
+	@echo "  gemma3-cpu         - Build Gemma3 runner with CPU backend"
+	@echo "  clean              - Clean build artifacts"
 
 voxtral-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
@@ -139,6 +140,15 @@ whisper-cuda:
 	@echo "✓ Build complete!"
 	@echo "   Binary: cmake-out/examples/models/whisper/whisper_runner"
 
+whisper-cuda-debug:
+	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
+	cmake --workflow --preset llm-debug-cuda
+	@echo "==> Building Whisper runner with CUDA (debug mode)..."
+	cd examples/models/whisper && cmake --workflow --preset whisper-cuda-debug
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "   Binary: cmake-out/examples/models/whisper/whisper_runner"
+
 whisper-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release
diff --git a/examples/models/whisper/CMakePresets.json b/examples/models/whisper/CMakePresets.json
index 1419788cb0c..a5bfc2be5a4 100644
--- a/examples/models/whisper/CMakePresets.json
+++ b/examples/models/whisper/CMakePresets.json
@@ -29,6 +29,20 @@
         "list": ["Linux", "Windows"]
       }
     },
+    {
+      "name": "whisper-cuda-debug",
+      "displayName": "Whisper runner (CUDA Debug)",
+      "inherits": ["whisper-base"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_CUDA": "ON",
+        "CMAKE_BUILD_TYPE": "Debug"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
     {
       "name": "whisper-metal",
       "displayName": "Whisper runner (Metal)",
@@ -56,6 +70,12 @@
       "configurePreset": "whisper-cuda",
       "targets": ["whisper_runner"]
     },
+    {
+      "name": "whisper-cuda-debug",
+      "displayName": "Build Whisper runner (CUDA Debug)",
+      "configurePreset": "whisper-cuda-debug",
+      "targets": ["whisper_runner"]
+    },
     {
       "name": "whisper-metal",
       "displayName": "Build Whisper runner (Metal)",
@@ -92,6 +112,20 @@
       }
     ]
   },
+  {
+    "name": "whisper-cuda-debug",
+    "displayName": "Configure and build Whisper runner (CUDA Debug)",
+    "steps": [
+      {
+        "type": "configure",
+        "name": "whisper-cuda-debug"
+      },
+      {
+        "type": "build",
+        "name": "whisper-cuda-debug"
+      }
+    ]
+  },
   {
     "name": "whisper-metal",
     "displayName": "Configure and build Whisper runner (Metal)",

From f02dbe10d2d2fcf5f52e79660fc0819087e47e4c Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Dec 2025 11:21:04 -0800
Subject: [PATCH 2/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml                    |  26 ++++
 backends/cuda/runtime/shims/memory.cpp        |  30 +++-
 .../cuda/runtime/shims/tests/CMakeLists.txt   |  60 ++++++++
 .../runtime/shims/tests/CMakePresets.json     |  88 ++++++++++++
 backends/cuda/runtime/shims/tests/README.md   | 132 ++++++++++++++++++
 ..._aoti_torch_create_tensor_from_blob_v2.cpp |  65 +++++++++
 6 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 backends/cuda/runtime/shims/tests/CMakeLists.txt
 create mode 100644 backends/cuda/runtime/shims/tests/CMakePresets.json
 create mode 100644 backends/cuda/runtime/shims/tests/README.md

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 265bc252ee8..904efb4e52f 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -87,6 +87,32 @@ jobs:
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
+  test-cuda-shims:
+    name: test-cuda-shims
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # Build ExecuTorch with CUDA support
+        ./install_executorch.sh
+
+        # Build and run CUDA shim tests
+        pushd backends/cuda/runtime/shims/tests
+        cmake --workflow --preset default
+        ctest --preset default --output-on-failure
+        popd
+
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index aaaf3913381..b6310b0ee5a 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -119,8 +119,6 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int32_t layout,
     const uint8_t* opaque_metadata,
     int64_t opaque_metadata_size) {
-  // TODO(gasoonjia): verify given data is on the target device
-  (void)device_type;
   (void)opaque_metadata;
   (void)layout;
   (void)opaque_metadata_size;
@@ -154,6 +152,34 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
   // Storage offset must be 0 since from_blob cannot handle different offsets
   ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset));
 
+  // Verify that data pointer location matches the requested device_type
+  cudaPointerAttributes data_attributes{};
+  ET_CUDA_CHECK_OR_RETURN_ERROR(
+      cudaPointerGetAttributes(&data_attributes, data));
+
+  bool data_is_on_device = data_attributes.type == cudaMemoryTypeDevice;
+  bool data_is_on_host = data_attributes.type == cudaMemoryTypeHost ||
+      data_attributes.type == cudaMemoryTypeUnregistered;
+  bool requested_device =
+      device_type == static_cast<int32_t>(SupportedDevices::CUDA);
+  bool requested_cpu =
+      device_type == static_cast<int32_t>(SupportedDevices::CPU);
+
+  // Error if data location doesn't match requested device type
+  ET_CHECK_OR_RETURN_ERROR(
+      !(data_is_on_device && requested_cpu),
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CUDA "
+      "but device_type is CPU. Data must be on CPU for CPU tensors.",
+      data);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      !(data_is_on_host && requested_device),
+      InvalidArgument,
+      "aoti_torch_create_tensor_from_blob_v2 failed: data pointer %p is on CPU "
+      "but device_type is CUDA. Data must be on GPU for CUDA tensors.",
+      data);
+
   // Convert sizes to the format expected by ExecuTorch using SizesType
   std::vector<SizesType> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
 
diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
new file mode 100644
index 00000000000..f626e067627
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+project(aoti_cuda_shim_tests LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Find required packages
+find_package(CUDAToolkit REQUIRED)
+find_package(GTest REQUIRED)
+
+# Get EXECUTORCH_ROOT
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+endif()
+
+# Find installed ExecuTorch
+find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX})
+
+# List of test files
+set(CUDA_SHIM_TESTS
+    test_aoti_torch_create_tensor_from_blob_v2
+    test_aoti_torch_empty_strided
+    test_aoti_torch_delete_tensor_object
+    test_aoti_torch__reinterpret_tensor
+    test_aoti_torch_copy_
+    test_aoti_torch_new_tensor_handle
+)
+
+enable_testing()
+
+foreach(test_name ${CUDA_SHIM_TESTS})
+  add_executable(${test_name} ${test_name}.cpp)
+
+  target_include_directories(
+    ${test_name}
+    PRIVATE ${EXECUTORCH_ROOT}
+            ${CUDAToolkit_INCLUDE_DIRS}
+  )
+
+  target_link_libraries(
+    ${test_name}
+    PRIVATE GTest::gtest
+            GTest::gtest_main
+            aoti_cuda_shims
+            aoti_cuda_backend
+            cuda_tensor_maker
+            cuda_platform
+            executorch_core
+            extension_tensor
+            CUDA::cudart
+  )
+
+  add_test(NAME ${test_name} COMMAND ${test_name})
+endforeach()
diff --git a/backends/cuda/runtime/shims/tests/CMakePresets.json b/backends/cuda/runtime/shims/tests/CMakePresets.json
new file mode 100644
index 00000000000..f4e4ee7253e
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/CMakePresets.json
@@ -0,0 +1,88 @@
+{
+  "version": 6,
+  "configurePresets": [
+    {
+      "name": "default",
+      "displayName": "CUDA Shim Tests",
+      "binaryDir": "${sourceDir}/../../../../../cmake-out/backends/cuda/runtime/shims/tests",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_PREFIX_PATH": "${sourceDir}/../../../../../cmake-out"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
+    {
+      "name": "debug",
+      "displayName": "CUDA Shim Tests (Debug)",
+      "inherits": ["default"],
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "default",
+      "displayName": "Build CUDA Shim Tests",
+      "configurePreset": "default"
+    },
+    {
+      "name": "debug",
+      "displayName": "Build CUDA Shim Tests (Debug)",
+      "configurePreset": "debug"
+    }
+  ],
+  "workflowPresets": [
+    {
+      "name": "default",
+      "displayName": "Configure and build CUDA Shim Tests",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "default"
+        },
+        {
+          "type": "build",
+          "name": "default"
+        }
+      ]
+    },
+    {
+      "name": "debug",
+      "displayName": "Configure and build CUDA Shim Tests (Debug)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "debug"
+        },
+        {
+          "type": "build",
+          "name": "debug"
+        }
+      ]
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "default",
+      "displayName": "Run all CUDA Shim Tests",
+      "configurePreset": "default",
+      "output": {
+        "outputOnFailure": true
+      }
+    },
+    {
+      "name": "debug",
+      "displayName": "Run all CUDA Shim Tests (Debug)",
+      "configurePreset": "debug",
+      "output": {
+        "outputOnFailure": true
+      }
+    }
+  ]
+}
+
diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
new file mode 100644
index 00000000000..2c4b91570b0
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -0,0 +1,132 @@
+# CUDA AOTI Shim Tests
+
+Unit tests for the CUDA AOTI (Ahead-Of-Time Inductor) shim functions used by the ExecuTorch CUDA backend.
+
+## Prerequisites
+
+1. **CUDA Toolkit**: Ensure CUDA is installed and available
+2. **ExecuTorch with CUDA**: Build and install ExecuTorch with CUDA support first
+
+## Building ExecuTorch with CUDA
+
+From the ExecuTorch root directory:
+
+```bash
+# Release build
+cmake --workflow --preset llm-release-cuda
+
+# Or debug build (recommended for debugging test failures)
+cmake --workflow --preset llm-debug-cuda
+```
+
+## Building the Tests
+
+### Option 1: Using CMake Presets (Recommended)
+
+From this directory (`backends/cuda/runtime/shims/tests/`):
+
+```bash
+# Release build
+cmake --workflow --preset default
+
+# Debug build
+cmake --workflow --preset debug
+```
+
+### Option 2: Manual CMake Commands
+
+From the ExecuTorch root directory:
+
+```bash
+# Configure
+cmake -B cmake-out/backends/cuda/runtime/shims/tests \
+  -S backends/cuda/runtime/shims/tests \
+  -DCMAKE_PREFIX_PATH=$(pwd)/cmake-out \
+  -DCMAKE_BUILD_TYPE=Debug
+
+# Build
+cmake --build cmake-out/backends/cuda/runtime/shims/tests -j$(nproc)
+```
+
+## Running the Tests
+
+### Run All Tests
+
+```bash
+# Using ctest (from the build directory)
+cd cmake-out/backends/cuda/runtime/shims/tests
+ctest --output-on-failure
+
+# Or using the test preset (from this directory)
+ctest --preset default
+```
+
+### Run a Specific Test
+
+```bash
+# From the build directory
+./test_aoti_torch_create_tensor_from_blob_v2
+./test_aoti_torch_empty_strided
+./test_aoti_torch_delete_tensor_object
+./test_aoti_torch_copy_
+./test_aoti_torch_new_tensor_handle
+./test_aoti_torch_item_bool
+./test_aoti_torch_assign_tensors_out
+```
+
+### Run Specific Test Cases
+
+Use Google Test filters to run specific test cases:
+
+```bash
+# Run only device mismatch tests
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="*DeviceMismatch*"
+
+# Run a single test
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="AOTITorchCreateTensorFromBlobV2Test.BasicFunctionalityCUDA"
+
+# List all available tests
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_list_tests
+```
+
+## Test Descriptions
+
+| Test File | Description |
+|-----------|-------------|
+| `test_aoti_torch_create_tensor_from_blob_v2` | Tests tensor creation from existing memory blobs, including device type validation |
+| `test_aoti_torch_empty_strided` | Tests creation of uninitialized tensors with specified strides |
+| `test_aoti_torch_delete_tensor_object` | Tests proper tensor deletion and memory management |
+| `test_aoti_torch__reinterpret_tensor` | Tests tensor view reinterpretation with different shapes/strides |
+| `test_aoti_torch_copy_` | Tests tensor copy operations between CPU and CUDA |
+| `test_aoti_torch_new_tensor_handle` | Tests creating new tensor handles that share memory |
+| `test_aoti_torch_item_bool` | Tests extracting boolean values from scalar tensors |
+| `test_aoti_torch_assign_tensors_out` | Tests creating tensor views that share underlying data |
+
+## Troubleshooting
+
+### CUDA Not Available
+
+If tests are skipped with "CUDA not available", ensure:
+- CUDA drivers are installed
+- A CUDA-capable GPU is present
+- `nvidia-smi` shows the GPU
+
+### Link Errors
+
+If you get link errors, ensure ExecuTorch was built with CUDA support:
+```bash
+cmake --workflow --preset llm-release-cuda
+```
+
+### Test Failures
+
+For debugging test failures, build with debug mode:
+```bash
+cmake --workflow --preset debug
+```
+
+Then run with verbose output:
+```bash
+./test_aoti_torch_create_tensor_from_blob_v2 --gtest_break_on_failure
+```
+
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index d9b785a5a78..db0ab84970d 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -752,3 +752,68 @@ TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) {
     EXPECT_EQ(error, Error::Ok);
   }
 }
+
+// Test device type mismatch: CPU data with CUDA device request should fail
+TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCPUDataCUDADevice) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculate_contiguous_strides(sizes);
+
+  // Allocate CPU memory
+  size_t bytes = calculate_numel(sizes) * sizeof(float);
+  void* cpu_data = allocate_cpu_memory(bytes);
+  ASSERT_NE(cpu_data, nullptr);
+
+  Tensor* tensor;
+  // Request CUDA device but provide CPU memory - should fail
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      cpu_data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CUDA), // Request CUDA
+      0, // device index
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::InvalidArgument)
+      << "Should fail when CPU data is provided but CUDA device is requested";
+}
+
+// Test device type mismatch: CUDA data with CPU device request should fail
+TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeviceMismatchCUDADataCPUDevice) {
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculate_contiguous_strides(sizes);
+
+  // Allocate CUDA memory (device memory, not managed)
+  size_t bytes = calculate_numel(sizes) * sizeof(float);
+  void* cuda_data = nullptr;
+  cudaError_t cuda_err = cudaMalloc(&cuda_data, bytes);
+  ASSERT_EQ(cuda_err, cudaSuccess);
+  ASSERT_NE(cuda_data, nullptr);
+
+  Tensor* tensor;
+  // Request CPU device but provide CUDA memory - should fail
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      cuda_data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0, // storage_offset
+      static_cast<int32_t>(SupportedDTypes::FLOAT32),
+      static_cast<int32_t>(SupportedDevices::CPU), // Request CPU
+      0, // device index
+      &tensor,
+      0, // layout
+      nullptr, // opaque_metadata
+      0); // opaque_metadata_size
+
+  EXPECT_EQ(error, Error::InvalidArgument)
+      << "Should fail when CUDA data is provided but CPU device is requested";
+
+  // Clean up the CUDA memory we allocated directly
+  cudaFree(cuda_data);
+}

From a97933b8acc62a5f8fa6ca837a1b4675228c7806 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 08:00:16 +0000
Subject: [PATCH 3/7] Update

[ghstack-poisoned]
---
 .../cuda/runtime/shims/tests/CMakeLists.txt   | 20 ++++---
 .../runtime/shims/tests/CMakePresets.json     |  1 -
 backends/cuda/runtime/shims/tests/README.md   | 54 +++----------------
 3 files changed, 17 insertions(+), 58 deletions(-)

diff --git a/backends/cuda/runtime/shims/tests/CMakeLists.txt b/backends/cuda/runtime/shims/tests/CMakeLists.txt
index 9c367ab8e1a..a7df6075c37 100644
--- a/backends/cuda/runtime/shims/tests/CMakeLists.txt
+++ b/backends/cuda/runtime/shims/tests/CMakeLists.txt
@@ -21,7 +21,10 @@ FetchContent_Declare(
   GIT_TAG v1.14.0
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+set(gtest_force_shared_crt
+    ON
+    CACHE BOOL "" FORCE
+)
 FetchContent_MakeAvailable(googletest)
 
 # Get EXECUTORCH_ROOT
@@ -34,12 +37,9 @@ find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX})
 
 # List of test files
 set(CUDA_SHIM_TESTS
-    test_aoti_torch_create_tensor_from_blob_v2
-    test_aoti_torch_empty_strided
-    test_aoti_torch_delete_tensor_object
-    test_aoti_torch__reinterpret_tensor
-    test_aoti_torch_copy_
-    test_aoti_torch_new_tensor_handle
+    test_aoti_torch_create_tensor_from_blob_v2 test_aoti_torch_empty_strided
+    test_aoti_torch_delete_tensor_object test_aoti_torch__reinterpret_tensor
+    test_aoti_torch_copy_ test_aoti_torch_new_tensor_handle
 )
 
 enable_testing()
@@ -48,10 +48,8 @@ foreach(test_name ${CUDA_SHIM_TESTS})
   add_executable(${test_name} ${test_name}.cpp)
 
   target_include_directories(
-    ${test_name}
-    PRIVATE ${EXECUTORCH_ROOT}/..
-            ${EXECUTORCH_ROOT}
-            ${CUDAToolkit_INCLUDE_DIRS}
+    ${test_name} PRIVATE ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}
+                 ${CUDAToolkit_INCLUDE_DIRS}
   )
 
   target_link_libraries(
diff --git a/backends/cuda/runtime/shims/tests/CMakePresets.json b/backends/cuda/runtime/shims/tests/CMakePresets.json
index 40e0dbbd698..33b448f1538 100644
--- a/backends/cuda/runtime/shims/tests/CMakePresets.json
+++ b/backends/cuda/runtime/shims/tests/CMakePresets.json
@@ -93,4 +93,3 @@
     }
   ]
 }
-
diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
index 2c4b91570b0..1456447eeba 100644
--- a/backends/cuda/runtime/shims/tests/README.md
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -13,13 +13,13 @@ From the ExecuTorch root directory:
 
 ```bash
 # Release build
-cmake --workflow --preset llm-release-cuda
+cmake --workflow llm-release-cuda
 
 # Or debug build (recommended for debugging test failures)
-cmake --workflow --preset llm-debug-cuda
+cmake --workflow llm-debug-cuda
 ```
 
-## Building the Tests
+## Building and Running the Tests
 
 ### Option 1: Using CMake Presets (Recommended)
 
@@ -27,10 +27,10 @@ From this directory (`backends/cuda/runtime/shims/tests/`):
 
 ```bash
 # Release build
-cmake --workflow --preset default
+cmake --workflow default
 
 # Debug build
-cmake --workflow --preset debug
+cmake --workflow debug
 ```
 
 ### Option 2: Manual CMake Commands
@@ -48,37 +48,13 @@ cmake -B cmake-out/backends/cuda/runtime/shims/tests \
 cmake --build cmake-out/backends/cuda/runtime/shims/tests -j$(nproc)
 ```
 
-## Running the Tests
-
-### Run All Tests
-
-```bash
-# Using ctest (from the build directory)
-cd cmake-out/backends/cuda/runtime/shims/tests
-ctest --output-on-failure
-
-# Or using the test preset (from this directory)
-ctest --preset default
-```
-
-### Run a Specific Test
-
-```bash
-# From the build directory
-./test_aoti_torch_create_tensor_from_blob_v2
-./test_aoti_torch_empty_strided
-./test_aoti_torch_delete_tensor_object
-./test_aoti_torch_copy_
-./test_aoti_torch_new_tensor_handle
-./test_aoti_torch_item_bool
-./test_aoti_torch_assign_tensors_out
-```
-
 ### Run Specific Test Cases
 
 Use Google Test filters to run specific test cases:
 
 ```bash
+# From the build directory
+cd cmake-out/backends/cuda/runtime/shims/tests
 # Run only device mismatch tests
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_filter="*DeviceMismatch*"
 
@@ -89,19 +65,6 @@ Use Google Test filters to run specific test cases:
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_list_tests
 ```
 
-## Test Descriptions
-
-| Test File | Description |
-|-----------|-------------|
-| `test_aoti_torch_create_tensor_from_blob_v2` | Tests tensor creation from existing memory blobs, including device type validation |
-| `test_aoti_torch_empty_strided` | Tests creation of uninitialized tensors with specified strides |
-| `test_aoti_torch_delete_tensor_object` | Tests proper tensor deletion and memory management |
-| `test_aoti_torch__reinterpret_tensor` | Tests tensor view reinterpretation with different shapes/strides |
-| `test_aoti_torch_copy_` | Tests tensor copy operations between CPU and CUDA |
-| `test_aoti_torch_new_tensor_handle` | Tests creating new tensor handles that share memory |
-| `test_aoti_torch_item_bool` | Tests extracting boolean values from scalar tensors |
-| `test_aoti_torch_assign_tensors_out` | Tests creating tensor views that share underlying data |
-
 ## Troubleshooting
 
 ### CUDA Not Available
@@ -122,11 +85,10 @@ cmake --workflow --preset llm-release-cuda
 
 For debugging test failures, build with debug mode:
 ```bash
-cmake --workflow --preset debug
+cmake --workflow debug
 ```
 
 Then run with verbose output:
 ```bash
 ./test_aoti_torch_create_tensor_from_blob_v2 --gtest_break_on_failure
 ```
-

From e1bb6c241cc008b96e63689b43c194521022fe80 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 17:40:55 +0000
Subject: [PATCH 4/7] Update

[ghstack-poisoned]
---
 backends/cuda/runtime/shims/tests/README.md | 10 +--
 extension/llm/custom_ops/custom_ops.py      | 86 +++++++++++++++++++++
 2 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/backends/cuda/runtime/shims/tests/README.md b/backends/cuda/runtime/shims/tests/README.md
index 1456447eeba..3e95280811b 100644
--- a/backends/cuda/runtime/shims/tests/README.md
+++ b/backends/cuda/runtime/shims/tests/README.md
@@ -13,10 +13,10 @@ From the ExecuTorch root directory:
 
 ```bash
 # Release build
-cmake --workflow llm-release-cuda
+cmake --workflow --preset llm-release-cuda
 
 # Or debug build (recommended for debugging test failures)
-cmake --workflow llm-debug-cuda
+cmake --workflow --preset llm-debug-cuda
 ```
 
 ## Building and Running the Tests
@@ -27,10 +27,10 @@ From this directory (`backends/cuda/runtime/shims/tests/`):
 
 ```bash
 # Release build
-cmake --workflow default
+cmake --workflow --preset default
 
 # Debug build
-cmake --workflow debug
+cmake --workflow --preset debug
 ```
 
 ### Option 2: Manual CMake Commands
@@ -85,7 +85,7 @@ cmake --workflow --preset llm-release-cuda
 
 For debugging test failures, build with debug mode:
 ```bash
-cmake --workflow debug
+cmake --workflow --preset debug
 ```
 
 Then run with verbose output:
diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
index dfa357fe356..51c9601454a 100644
--- a/extension/llm/custom_ops/custom_ops.py
+++ b/extension/llm/custom_ops/custom_ops.py
@@ -16,6 +16,12 @@
 
 from torch.library import impl
 
+from typing import Tuple
+
+from torch._inductor.lowering import lowerings as L, register_lowering
+
+aten = torch.ops.aten
+
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
@@ -387,3 +393,83 @@ def custom_quantized_sdpa_meta(
     )
 
     return torch.empty(query.size(), dtype=torch.float32, device="meta")
+
+# 1) Define the custom op in the "executorch" namespace with name "alias"
+@torch.library.custom_op("executorch::alias", mutates_args=())
+def custom_alias(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    # no copies, just pass-through
+    return x, y
+
+
+# 2) FakeTensor kernel: describes output metadata for compile-time
+@custom_alias.register_fake
+def _(x, y):
+    # For this op, outputs have exactly the same shape/dtype/device as inputs.
+    # We just need *dummy* tensors with that metadata.
+    out_x = torch.empty_like(x)
+    out_y = torch.empty_like(y)
+    return out_x, out_y
+
+
+@register_lowering(torch.ops.executorch.alias.default)
+def lowering_custom_alias(x, y):
+    # x, y here are IR values (Inductor's internal representation).
+    # Alias is logically a no-op – just pass them through.
+    return x, y
+
+
+# Expecting cache shape: (B, H, S_max, D), value shape (B, H, S, D) where S <= S_max
+def _validate_cross_attn_cache_params(value: torch.Tensor, cache: torch.Tensor):
+    torch._assert(value.dim() == 4, "value must be 4D")
+    torch._assert(cache.dim() == 4, "cache must be 4D")
+    # Cache shape: (B, H, S_max, D)
+    # Value shape: (B, H, S, D)
+    torch._assert(
+        value.size(2) <= cache.size(2),
+        f"value sequence length {value.size(2)} exceeds cache size {cache.size(2)}",
+    )
+    torch._assert(value.size(0) == cache.size(0), "batch size mismatch")
+    torch._assert(value.size(1) == cache.size(1), "num heads mismatch")
+    torch._assert(value.size(3) == cache.size(3), "head dim mismatch")
+    torch._assert(value.dtype == cache.dtype, "dtype mismatch")
+
+
+# This is cheating: we deliberately do NOT mark `cache` as mutating so that this
+# custom op can be used in HOPs such as `torch.cond`, where `torch.compile` requires
+# no aliasing or mutation in the branches. This is fine because we only care about inference.
+@torch.library.custom_op("executorch::update_cross_attn_cache", mutates_args=[])
+def _update_cross_attn_cache(value: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
+    # Eager implementation
+    _validate_cross_attn_cache_params(value, cache)
+
+    # Slice the cache to match value's sequence length and copy
+    # cache shape: [B, H, S_max, D]
+    # value shape: [B, H, S, D]
+    cache[:, :, : value.size(2), :].copy_(value)
+    return cache
+
+
+# Register the fake (meta) kernel
+@_update_cross_attn_cache.register_fake
+def _update_cross_attn_cache_fake(
+    value: torch.Tensor, cache: torch.Tensor
+) -> torch.Tensor:
+    _validate_cross_attn_cache_params(value, cache)
+    return torch.empty_like(cache)
+
+
+# Register Inductor lowering
+@register_lowering(torch.ops.executorch.update_cross_attn_cache)
+def _update_cross_attn_cache_lowering(value, cache):
+    # cache shape: [B, H, S_max, D]
+    # value shape: [B, H, S, D]
+
+    # We need to slice the cache along dim 2 (sequence length)
+    # slice(self, dim, start, end, step=1)
+    seq_len = value.get_size()[2]
+    cache_slice = L[aten.slice.Tensor](cache, 2, 0, seq_len, 1)
+
+    # Copy value into the slice
+    L[aten.copy_.default](cache_slice, value)
+
+    return cache
\ No newline at end of file

From 2a7a9f0d96c974adfcf9dd552ec88b92540736f1 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 17:43:43 +0000
Subject: [PATCH 5/7] Update

[ghstack-poisoned]
---
 extension/llm/custom_ops/custom_ops.py | 86 --------------------------
 1 file changed, 86 deletions(-)

diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
index 51c9601454a..dfa357fe356 100644
--- a/extension/llm/custom_ops/custom_ops.py
+++ b/extension/llm/custom_ops/custom_ops.py
@@ -16,12 +16,6 @@
 
 from torch.library import impl
 
-from typing import Tuple
-
-from torch._inductor.lowering import lowerings as L, register_lowering
-
-aten = torch.ops.aten
-
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
@@ -393,83 +387,3 @@ def custom_quantized_sdpa_meta(
     )
 
     return torch.empty(query.size(), dtype=torch.float32, device="meta")
-
-# 1) Define the custom op in the "executorch" namespace with name "alias"
-@torch.library.custom_op("executorch::alias", mutates_args=())
-def custom_alias(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    # no copies, just pass-through
-    return x, y
-
-
-# 2) FakeTensor kernel: describes output metadata for compile-time
-@custom_alias.register_fake
-def _(x, y):
-    # For this op, outputs have exactly the same shape/dtype/device as inputs.
-    # We just need *dummy* tensors with that metadata.
-    out_x = torch.empty_like(x)
-    out_y = torch.empty_like(y)
-    return out_x, out_y
-
-
-@register_lowering(torch.ops.executorch.alias.default)
-def lowering_custom_alias(x, y):
-    # x, y here are IR values (Inductor's internal representation).
-    # Alias is logically a no-op – just pass them through.
-    return x, y
-
-
-# Expecting cache shape: (B, H, S_max, D), value shape (B, H, S, D) where S <= S_max
-def _validate_cross_attn_cache_params(value: torch.Tensor, cache: torch.Tensor):
-    torch._assert(value.dim() == 4, "value must be 4D")
-    torch._assert(cache.dim() == 4, "cache must be 4D")
-    # Cache shape: (B, H, S_max, D)
-    # Value shape: (B, H, S, D)
-    torch._assert(
-        value.size(2) <= cache.size(2),
-        f"value sequence length {value.size(2)} exceeds cache size {cache.size(2)}",
-    )
-    torch._assert(value.size(0) == cache.size(0), "batch size mismatch")
-    torch._assert(value.size(1) == cache.size(1), "num heads mismatch")
-    torch._assert(value.size(3) == cache.size(3), "head dim mismatch")
-    torch._assert(value.dtype == cache.dtype, "dtype mismatch")
-
-
-# This is cheating: we deliberately do NOT mark `cache` as mutating so that this
-# custom op can be used in HOPs such as `torch.cond`, where `torch.compile` requires
-# no aliasing or mutation in the branches. This is fine because we only care about inference.
-@torch.library.custom_op("executorch::update_cross_attn_cache", mutates_args=[])
-def _update_cross_attn_cache(value: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
-    # Eager implementation
-    _validate_cross_attn_cache_params(value, cache)
-
-    # Slice the cache to match value's sequence length and copy
-    # cache shape: [B, H, S_max, D]
-    # value shape: [B, H, S, D]
-    cache[:, :, : value.size(2), :].copy_(value)
-    return cache
-
-
-# Register the fake (meta) kernel
-@_update_cross_attn_cache.register_fake
-def _update_cross_attn_cache_fake(
-    value: torch.Tensor, cache: torch.Tensor
-) -> torch.Tensor:
-    _validate_cross_attn_cache_params(value, cache)
-    return torch.empty_like(cache)
-
-
-# Register Inductor lowering
-@register_lowering(torch.ops.executorch.update_cross_attn_cache)
-def _update_cross_attn_cache_lowering(value, cache):
-    # cache shape: [B, H, S_max, D]
-    # value shape: [B, H, S, D]
-
-    # We need to slice the cache along dim 2 (sequence length)
-    # slice(self, dim, start, end, step=1)
-    seq_len = value.get_size()[2]
-    cache_slice = L[aten.slice.Tensor](cache, 2, 0, seq_len, 1)
-
-    # Copy value into the slice
-    L[aten.copy_.default](cache_slice, value)
-
-    return cache
\ No newline at end of file

From 8b9408775df14d8adee82a8a0f9e8fbd849dee52 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 19:54:40 +0000
Subject: [PATCH 6/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml                                    | 4 ++--
 backends/cuda/runtime/shims/memory.cpp                        | 8 +++-----
 .../shims/tests/test_aoti_torch__reinterpret_tensor.cpp       | 6 ------
 .../tests/test_aoti_torch_create_tensor_from_blob_v2.cpp      | 6 ------
 4 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 0dbee174bd3..d12ab2df73e 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -105,11 +105,11 @@ jobs:
         set -eux
 
         # Build ExecuTorch with CUDA support
-        cmake --workflow llm-release-cuda
+        cmake --workflow --preset llm-release-cuda
 
         # Build and run CUDA shim tests
         pushd backends/cuda/runtime/shims/tests
-        cmake --workflow default
+        cmake --workflow --preset default
         popd
 
   export-model-cuda-artifact:
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 7599bd19b05..ecb1ded2f39 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -331,12 +331,10 @@ void clear_all_tensors() {
   // tensors set should now be empty, but ensure it's cleared
   tensors.clear();
 
-  ET_LOG(Info, "Cleared all tensors");
-}
-
-void clear_memory_tracking() {
+  // Clear memory tracking map (includes leftover NOT_OWN entries)
   memory_to_n_tensor.clear();
-  ET_LOG(Info, "Cleared memory tracking map");
+
+  ET_LOG(Info, "Cleared all tensors and memory tracking");
 }
 
 AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
index 2e646fb9a66..d3044810b15 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -42,9 +42,6 @@ class AOTITorchReinterpretTensorTest : public ::testing::Test {
 
     // Clear any remaining tensors from previous tests
     clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   void TearDown() override {
@@ -53,9 +50,6 @@ class AOTITorchReinterpretTensorTest : public ::testing::Test {
 
     // Clear the global tensor storage using the provided function
    clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   // Helper to calculate number of elements from sizes
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index ce9c2bb633d..db0ab84970d 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -40,9 +40,6 @@ class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test {
 
     // Clear any remaining tensors from previous tests
     clear_all_tensors();
-
-    // Clear memory tracking map
-    clear_memory_tracking();
   }
 
   void TearDown() override {
@@ -52,9 +49,6 @@ class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test {
 
     // Clear the global tensor storage using the provided function
     clear_all_tensors();
 
-    // Clear memory tracking map
-    clear_memory_tracking();
-
     // Clean up any allocated memory buffers
     for (void* ptr : cuda_memory_buffers_) {
       if (ptr) {

From 73efe126558e91f2c784847d1f7318a74b235b0a Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 22 Dec 2025 23:36:29 +0000
Subject: [PATCH 7/7] Update

[ghstack-poisoned]
---
 .github/workflows/cuda.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index d12ab2df73e..dbce874cfc0 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -103,6 +103,8 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
+        # Install requirements
+        bash ./install_requirements.sh
 
         # Build ExecuTorch with CUDA support
         cmake --workflow --preset llm-release-cuda
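
Editor's note: the core of PATCH 2/7 is the device-type validation in `aoti_torch_create_tensor_from_blob_v2`, which classifies the caller's pointer with `cudaPointerGetAttributes` before wrapping it in a tensor. Below is a minimal standalone sketch of that classification, assuming only the CUDA runtime API; the helper `pointer_matches_device` and the `main` driver are illustrative names, not part of the patch, and the shim's `ET_*` error macros are replaced with plain return values. Build with, e.g., `nvcc pointer_check.cu -o pointer_check`.

```cpp
// Sketch of the host/device pointer check added in PATCH 2/7, reduced to
// plain CUDA runtime calls so it compiles on its own.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Returns true if `ptr` may back a tensor of the requested device type,
// mirroring the patch: registered and unregistered host memory both count
// as "host", cudaMemoryTypeDevice counts as "device". Managed memory is
// neither, so (like the patch's two checks) it passes for either request.
static bool pointer_matches_device(const void* ptr, bool want_cuda) {
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) {
    return false;  // treat a failed query as a mismatch
  }
  bool on_device = attr.type == cudaMemoryTypeDevice;
  bool on_host = attr.type == cudaMemoryTypeHost ||
      attr.type == cudaMemoryTypeUnregistered;
  // CUDA request rejects host data; CPU request rejects device data.
  return want_cuda ? !on_host : !on_device;
}

int main() {
  float* host_buf = static_cast<float*>(std::malloc(6 * sizeof(float)));
  void* dev_buf = nullptr;
  if (cudaMalloc(&dev_buf, 6 * sizeof(float)) != cudaSuccess) {
    std::puts("no CUDA device available; skipping");
    std::free(host_buf);
    return 0;
  }
  // Same two scenarios as the new DeviceMismatch unit tests:
  std::printf("host data, CUDA request: %s\n",
              pointer_matches_device(host_buf, true) ? "ok" : "rejected");
  std::printf("device data, CPU request: %s\n",
              pointer_matches_device(dev_buf, false) ? "ok" : "rejected");
  cudaFree(dev_buf);
  std::free(host_buf);
  return 0;
}
```

One detail this relies on: since CUDA 11, `cudaPointerGetAttributes` succeeds on a plain `malloc` pointer and reports `cudaMemoryTypeUnregistered` (older toolkits returned an error instead), which is why the patch treats both `cudaMemoryTypeHost` and `cudaMemoryTypeUnregistered` as host memory.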