From 1995f7b14f9c3c6f80713e9785dec12d49e7b00b Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 10:52:41 -0800
Subject: [PATCH 01/15] Add script to install Windows CUDA cross-compilation deps

---
 scripts/install_windows_cuda_deps.py | 330 +++++++++++++++++++++++++++
 1 file changed, 330 insertions(+)
 create mode 100755 scripts/install_windows_cuda_deps.py

diff --git a/scripts/install_windows_cuda_deps.py b/scripts/install_windows_cuda_deps.py
new file mode 100755
index 00000000000..30611dcaaed
--- /dev/null
+++ b/scripts/install_windows_cuda_deps.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Script to install Windows CUDA dependencies for cross-compilation.
+Supports Fedora/RHEL and WSL environments.
+
+Detects CUDA version from the installed PyTorch to ensure compatibility.
+"""
+
+import argparse
+import os
+import platform
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+# Mapping of CUDA versions to their corresponding driver versions for Windows installers
+# Source: https://developer.nvidia.com/cuda-toolkit-archive
+CUDA_DRIVER_VERSION_MAP = {
+    # CUDA 12.9.x
+    "12.9.1": "576.40",
+    "12.9.0": "576.02",
+    # CUDA 12.8.x
+    "12.8.1": "572.17",
+    "12.8.0": "571.96",
+    # CUDA 12.6.x
+    "12.6.3": "561.17",
+    "12.6.2": "560.94",
+    "12.6.1": "560.94",
+    "12.6.0": "560.76",
+}
+
+
+class Colors:  # ANSI escape sequences used to colour the log-level prefixes
+    RED = "\033[0;31m"  # prefix colour used by log_error
+    GREEN = "\033[0;32m"  # prefix colour used by log_info
+    YELLOW = "\033[1;33m"  # prefix colour used by log_warn
+    NC = "\033[0m"  # No Color (emitted right after the prefix)
+
+
+def log_info(msg: str) -> None:  # informational message with a green tag
+    print("{}[INFO]{} {}".format(Colors.GREEN, Colors.NC, msg))
+
+
+def log_warn(msg: str) -> None:  # warning message with a yellow tag
+    print("{}[WARN]{} {}".format(Colors.YELLOW, Colors.NC, msg))
+
+
+def log_error(msg: str) -> None:  # errors go to stderr, not into piped stdout
+    print(f"{Colors.RED}[ERROR]{Colors.NC} {msg}", file=sys.stderr)
+
+
+def get_pytorch_cuda_version() -> tuple[str, str] | None:
+    """
+    Get the CUDA version from the installed PyTorch.
+
+    Returns:
+        A tuple of (cuda_version, driver_version) if found, None otherwise.
+    """
+    try:
+        import torch
+    except ImportError:
+        log_error("PyTorch is not installed. Cannot detect CUDA version.")
+        return None
+
+    cuda_version = torch.version.cuda
+    if cuda_version is None:
+        log_error("PyTorch is not built with CUDA support.")
+        return None
+
+    log_info(f"Detected PyTorch CUDA version: {cuda_version}")
+
+    # torch.version.cuda is normally major.minor (e.g. "12.8"). Compare whole
+    # dotted components so "12.1" can never select a "12.10.x" toolkit.
+    wanted = cuda_version.split(".")
+    matching_versions = [
+        v for v in CUDA_DRIVER_VERSION_MAP if v.split(".")[: len(wanted)] == wanted
+    ]
+    if not matching_versions:
+        log_error(
+            f"CUDA version {cuda_version} is not in the known version map. "
+            f"Known versions: {', '.join(sorted(CUDA_DRIVER_VERSION_MAP.keys()))}"
+        )
+        return None
+
+    # Use the latest patch version, ordered numerically ("12.6.10" > "12.6.3")
+    newest = max(matching_versions, key=lambda v: [int(p) for p in v.split(".")])
+    driver_version = CUDA_DRIVER_VERSION_MAP[newest]
+
+    log_info(f"Using CUDA {newest} with driver {driver_version}")
+    return newest, driver_version
+
+
+def run_command(
+    cmd: list[str], check: bool = True, capture_output: bool = False
+) -> subprocess.CompletedProcess:
+    """Log the command line, then execute it via subprocess.run in text mode."""
+    log_info("Running: {}".format(" ".join(cmd)))
+    return subprocess.run(cmd, text=True, capture_output=capture_output, check=check)
+
+
+def detect_environment() -> str:
+    """Return "wsl", "fedora" or "unknown" for the host this script runs on."""
+    # Anything that is not Linux is out of scope
+    if platform.system() != "Linux":
+        return "unknown"
+
+    # WSL: look for "microsoft" in the contents of /proc/version
+    try:
+        kernel_info = Path("/proc/version").read_text()
+    except FileNotFoundError:
+        kernel_info = ""
+    if "microsoft" in kernel_info.lower():
+        return "wsl"
+
+    # RHEL/Fedora: release file present, or dnf found on PATH
+    is_redhat_like = Path("/etc/redhat-release").exists() or shutil.which("dnf")
+    if is_redhat_like:
+        return "fedora"
+    return "unknown"
+
+
+def install_mingw_fedora() -> None:
+    """Install mingw64 on Fedora/RHEL."""
+    log_info("Installing mingw64 for Fedora (dnf)...")
+    run_command(["sudo", "dnf", "install", "-y", "mingw64-gcc-c++"])
+
+    log_info("Verifying installation...")  # check the C++ driver, as on WSL
+    run_command(["x86_64-w64-mingw32-g++", "--version"])
+
+
+def install_mingw_wsl() -> None:
+    """Install mingw64 on WSL."""
+    log_info("Installing mingw64 for WSL...")
+    # apt-get rather than apt: apt's CLI is not stable for scripted use.
+    run_command(["sudo", "apt-get", "update"])
+    run_command(["sudo", "apt-get", "install", "-y", "g++-mingw-w64-x86-64-win32"])
+    log_info("Verifying installation...")
+    run_command(["x86_64-w64-mingw32-g++", "--version"])
+
+
+def install_7zip(env_type: str) -> None:
+    """Install 7zip if not already available."""
+    if shutil.which("7z"):
+        log_info("7zip already installed")
+        return
+
+    log_info("Installing 7zip...")
+    if env_type == "fedora":
+        run_command(["sudo", "dnf", "install", "-y", "p7zip", "p7zip-plugins"])
+    else:  # apt-get rather than apt: apt's CLI is not stable for scripted use
+        run_command(["sudo", "apt-get", "install", "-y", "p7zip-full"])
+
+
+def find_windows_cuda_install(cuda_version: str) -> Path | None:
+    """
+    Look for a CUDA toolkit installed on the Windows side of a WSL host.
+
+    Args:
+        cuda_version: Full CUDA version such as "12.6.0"; only major.minor is used.
+
+    Returns:
+        The toolkit directory if it exists, otherwise None.
+    """
+    major_minor = ".".join(cuda_version.split(".")[:2])
+    # Versioned toolkit directories live under this root on the /mnt/c mount
+    toolkit_root = Path("/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA")
+    candidate = toolkit_root / f"v{major_minor}"
+
+    if not candidate.exists():
+        log_info(f"No Windows CUDA installation found at: {candidate}")
+        return None
+
+    log_info(f"Found Windows CUDA installation at: {candidate}")
+    return candidate
+
+
+def set_windows_cuda_home(cuda_home_path: Path) -> None:
+    """
+    Set WINDOWS_CUDA_HOME environment variable in the user's shell config.
+
+    Adds the export to ~/.bashrc and ~/.zshrc if they exist.
+    Also sets it in the current environment.
+    """
+    export_line = f'export WINDOWS_CUDA_HOME="{cuda_home_path}"'
+
+    # Set in current environment
+    os.environ["WINDOWS_CUDA_HOME"] = str(cuda_home_path)
+    log_info(f"Set WINDOWS_CUDA_HOME={cuda_home_path}")
+
+    # Add to shell config files
+    shell_configs = [
+        Path.home() / ".bashrc",
+        Path.home() / ".zshrc",
+    ]
+
+    for config_file in shell_configs:
+        if not config_file.exists():
+            continue
+
+        # Replace only our own export statement; other lines that merely
+        # mention the variable (e.g. a PATH entry using it) are preserved.
+        marker = "export WINDOWS_CUDA_HOME="
+        lines = config_file.read_text().splitlines()
+        kept = [line for line in lines if not line.lstrip().startswith(marker)]
+        if len(kept) != len(lines):
+            log_info(f"WINDOWS_CUDA_HOME already in {config_file}, updating...")
+            kept.append(export_line)
+            config_file.write_text("\n".join(kept) + "\n")
+        else:
+            log_info(f"Adding WINDOWS_CUDA_HOME to {config_file}")
+            with open(config_file, "a") as f:
+                # Leading newline keeps our block apart from existing content.
+                f.write("\n# Windows CUDA path for cross-compilation\n")
+                f.write(f"{export_line}\n")
+
+
+def download_and_extract_cuda(
+    cuda_version: str, cuda_driver_version: str, install_dir: Path, env_type: str
+) -> None:
+    """Download the Windows CUDA installer into install_dir and unpack it with 7z."""
+    log_info("Setting up CUDA toolkit for Windows cross-compilation...")
+    install_dir.mkdir(parents=True, exist_ok=True)
+
+    cuda_installer = f"cuda_{cuda_version}_{cuda_driver_version}_windows.exe"
+    cuda_installer_path = install_dir / cuda_installer
+    cuda_url = (
+        f"https://developer.download.nvidia.com/compute/cuda/{cuda_version}/"
+        f"local_installers/{cuda_installer}"
+    )
+
+    # Fetch into a ".part" file first: "wget -O" creates its target even on failure.
+    if not cuda_installer_path.exists():
+        log_info(f"Downloading CUDA {cuda_version} Windows installer...")
+        partial_path = cuda_installer_path.with_name(cuda_installer + ".part")
+        run_command(["wget", cuda_url, "-O", str(partial_path)])
+        partial_path.replace(cuda_installer_path)
+    else:
+        log_info("CUDA installer already downloaded, skipping download...")
+
+    install_7zip(env_type)
+
+    # Extract CUDA toolkit
+    extracted_dir = install_dir / "extracted"
+    if not extracted_dir.exists():
+        log_info("Extracting CUDA toolkit...")
+        run_command(["7z", "x", str(cuda_installer_path), f"-o{extracted_dir}", "-y"])
+    else:
+        log_info("CUDA already extracted, skipping extraction...")
+
+
+def main() -> int:
+    """Run the installer; return the process exit code (0 ok, 1 failure)."""
+    parser = argparse.ArgumentParser(
+        description="Install Windows CUDA dependencies for cross-compilation. "
+        "CUDA version is automatically detected from PyTorch installation."
+    )
+    parser.add_argument(
+        "--install-dir",
+        type=Path,
+        default=Path(os.environ.get("INSTALL_DIR", Path.home() / "cuda-windows")),
+        help="Installation directory (default: $HOME/cuda-windows)",
+    )
+
+    args = parser.parse_args()
+
+    env_type = detect_environment()
+    log_info(f"Detected environment: {env_type}")
+
+    if env_type == "unknown":
+        log_error("Unknown environment. This script supports Fedora/RHEL and WSL.")
+        return 1
+
+    # Install mingw
+    try:
+        if env_type == "fedora":
+            install_mingw_fedora()
+        elif env_type == "wsl":
+            install_mingw_wsl()
+    except (subprocess.CalledProcessError, OSError) as e:  # OSError: tool missing
+        log_error(f"Failed to install mingw: {e}")
+        return 1
+
+    # Get CUDA version from PyTorch
+    cuda_info = get_pytorch_cuda_version()
+    if cuda_info is None:
+        return 1
+
+    cuda_version, cuda_driver_version = cuda_info
+
+    # For WSL, check if CUDA is already installed on Windows
+    if env_type == "wsl":
+        windows_cuda_path = find_windows_cuda_install(cuda_version)
+        if windows_cuda_path is not None:
+            log_info("Using existing Windows CUDA installation.")
+            set_windows_cuda_home(windows_cuda_path)
+            log_info("")
+            log_info("Installation complete!")
+            return 0
+
+        log_info("Will download CUDA toolkit instead...")
+
+    # Download and extract CUDA
+    try:
+        download_and_extract_cuda(
+            cuda_version,
+            cuda_driver_version,
+            args.install_dir,
+            env_type,
+        )
+
+        cuda_home_path = args.install_dir / "extracted" / "cuda_cudart" / "cudart"
+        set_windows_cuda_home(cuda_home_path)
+    except (subprocess.CalledProcessError, OSError) as e:  # OSError: tool missing
+        log_error(f"Failed to download/extract CUDA: {e}")
+        return 1
+
+    log_info("")
+    log_info("Installation complete!")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 13ab5f85756a4353d7b136eb5546e6eb20281dbd Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 10:52:57 -0800
Subject: [PATCH 02/15] Add cuda-windows CI workflow and export-script device

---
 .ci/scripts/export_model_artifact.sh | 17 ++++--
 .github/workflows/cuda-windows.yml   | 79 ++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/cuda-windows.yml

diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 3c173b0ea2a..188f375202f 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -58,11 +58,13 @@ OUTPUT_DIR="${4:-.}"
 case "$DEVICE" in
   cuda)
     ;;
+  cuda-windows)
+    ;;
   metal)
     ;;
   *)
     echo "Error: Unsupported device '$DEVICE'"
-    echo "Supported devices: cuda, metal"
+    echo "Supported devices: cuda, cuda-windows, metal"
     exit 1
     ;;
 esac
@@ -147,7 +149,7 @@ if [ -n "$MAX_SEQ_LEN" ]; then
 fi
 
 DEVICE_ARG=""
-if [ "$DEVICE" = "cuda" ]; then
+if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
   DEVICE_ARG="--device cuda"
 fi
 
@@ -169,8 +171,15 @@ if [ -n "$PREPROCESSOR_OUTPUT" ]; then
       --output_file $PREPROCESSOR_OUTPUT
 fi
 
+# Determine blob file name - cuda and cuda-windows both use aoti_cuda_blob.ptd
+if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
+  BLOB_FILE="aoti_cuda_blob.ptd"
+else
+  BLOB_FILE="aoti_${DEVICE}_blob.ptd"
+fi
+
 test -f model.pte
-test -f aoti_${DEVICE}_blob.ptd
+test -f $BLOB_FILE
 if [ -n "$PREPROCESSOR_OUTPUT" ]; then
   test -f $PREPROCESSOR_OUTPUT
 fi
@@ -179,7 +188,7 @@ echo "::endgroup::"
 echo "::group::Store $MODEL_NAME Artifacts"
 mkdir -p "${OUTPUT_DIR}"
 mv model.pte "${OUTPUT_DIR}/"
-mv aoti_${DEVICE}_blob.ptd "${OUTPUT_DIR}/"
+mv $BLOB_FILE "${OUTPUT_DIR}/"
 if [ -n "$PREPROCESSOR_OUTPUT" ]; then
   mv $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
 fi
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
new file mode 100644
index 00000000000..98e950aec05
--- /dev/null
+++ b/.github/workflows/cuda-windows.yml
@@ -0,0 +1,79 @@
+# Test ExecuTorch CUDA Windows Cross-Compilation Export
+# This workflow tests model export targeting CUDA Windows using optimum-executorch.
+# It runs on a Linux machine with CUDA and uses the install_windows_cuda_deps.py
+# script to install the Windows CUDA cross-compilation dependencies.
+
+name: Test CUDA Windows Export
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  export-model-cuda-windows-artifact:
+    name: export-model-cuda-windows-artifact
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        model:
+          - repo: "mistralai"
+            name: "Voxtral-Mini-3B-2507"
+          - repo: "openai"
+            name: "whisper-small"
+          - repo: "openai"
+            name: "whisper-large-v3-turbo"
+          - repo: "google"
+            name: "gemma-3-4b-it"
+        quant:
+          - "non-quantized"
+          - "quantized-int4-tile-packed"
+          - "quantized-int4-weight-only"
+        exclude:
+          # TODO: enable int4-weight-only on gemma3.
+          - model:
+              repo: "google"
+              name: "gemma-3-4b-it"
+            quant: "quantized-int4-weight-only"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Install Windows CUDA Dependencies"
+        python scripts/install_windows_cuda_deps.py
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        echo "::endgroup::"
+
+        source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

From 567e12e0ff16247a197de401ec6c50d17a48b139 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 14:10:31 -0800
Subject: [PATCH 03/15] CI try 2: build cuda-windows Docker image with mingw and CUDA

---
 .ci/docker/build.sh                           |   9 ++
 .../install_cuda_windows_cross_compile.sh     | 144 ++++++++++++++++++
 .ci/docker/common/install_pytorch_cuda.sh     |  30 ++++
 .ci/docker/ubuntu/Dockerfile                  |  12 ++
 .github/workflows/cuda-windows.yml            |  20 ++-
 .github/workflows/docker-builds.yml           |   3 +-
 6 files changed, 209 insertions(+), 9 deletions(-)
 create mode 100644 .ci/docker/common/install_cuda_windows_cross_compile.sh
 create mode 100644 .ci/docker/common/install_pytorch_cuda.sh

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 5b46e62067f..97347d5e5fe 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -67,6 +67,13 @@ case "${IMAGE_NAME}" in
     # From https://developer.android.com/ndk/downloads
     ANDROID_NDK_VERSION=r28c
     ;;
+  executorch-ubuntu-22.04-cuda-windows)
+    LINTRUNNER=""
+    GCC_VERSION=11
+    CUDA_WINDOWS_CROSS_COMPILE=yes
+    CUDA_VERSION=12.8
+    SKIP_PYTORCH=yes
+    ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
     exit 1
@@ -101,6 +108,8 @@ docker build \
   --build-arg "MEDIATEK_SDK=${MEDIATEK_SDK:-}" \
   --build-arg "ANDROID_NDK_VERSION=${ANDROID_NDK_VERSION:-}" \
   --build-arg "SKIP_PYTORCH=${SKIP_PYTORCH:-}" \
+  --build-arg "CUDA_WINDOWS_CROSS_COMPILE=${CUDA_WINDOWS_CROSS_COMPILE:-}" \
+  --build-arg "CUDA_VERSION=${CUDA_VERSION:-}" \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
new file mode 100644
index 00000000000..21d4fa76a72
--- /dev/null
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install mingw-w64 cross-compiler and Windows CUDA toolkit for cross-compilation
+
+set -ex
+
+INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"
+
+# Mapping of CUDA versions to their corresponding driver versions for Windows installers
+# Source: https://developer.nvidia.com/cuda-toolkit-archive
+declare -A CUDA_DRIVER_MAP=(
+    ["12.6"]="12.6.3:561.17"
+    ["12.8"]="12.8.1:572.17"
+    ["12.9"]="12.9.1:576.40"
+    ["13.0"]="13.0.1:578.22"
+)
+
+install_mingw() {
+    echo "Installing mingw-w64 cross-compiler..."
+
+    apt-get update
+    apt-get install -y --no-install-recommends \
+        g++-mingw-w64-x86-64 \
+        mingw-w64-tools \
+        p7zip-full \
+        wget
+
+    # Verify installation
+    x86_64-w64-mingw32-g++ --version
+
+    # Cleanup
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+
+    echo "mingw-w64 installation complete"
+}
+
+get_torch_cuda_version() {
+    # Query PyTorch for its CUDA version
+    python3 -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo ""
+}
+
+install_windows_cuda() {
+    # Get CUDA version from torch
+    TORCH_CUDA_VERSION=$(get_torch_cuda_version)
+
+    if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
+        echo "ERROR: Could not detect CUDA version from PyTorch."
+        echo "Make sure PyTorch with CUDA support is installed before running this script."
+        exit 1
+    fi
+
+    echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
+
+    # Extract major.minor version (e.g., "12.8" from "12.8.1" or "12.8")
+    CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
+
+    # Look up the full version and driver version
+    if [ -z "${CUDA_DRIVER_MAP[${CUDA_MAJOR_MINOR}]}" ]; then
+        echo "ERROR: CUDA version ${CUDA_MAJOR_MINOR} is not in the known version map."
+        echo "Known versions: ${!CUDA_DRIVER_MAP[*]}"
+        exit 1
+    fi
+
+    CUDA_INFO="${CUDA_DRIVER_MAP[${CUDA_MAJOR_MINOR}]}"
+    CUDA_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f1)
+    CUDA_DRIVER_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f2)
+
+    echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}"
+
+    echo "Installing Windows CUDA toolkit ${CUDA_VERSION}..."
+
+    mkdir -p "${INSTALL_DIR}"
+    cd "${INSTALL_DIR}"
+
+    CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
+    CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"
+
+    # Check if already downloaded and extracted
+    if [ -d "${INSTALL_DIR}/extracted/cuda_cudart" ]; then
+        echo "Windows CUDA toolkit already installed, skipping download..."
+        return 0
+    fi
+
+    echo "Downloading CUDA installer from ${CUDA_URL}..."
+    wget -q "${CUDA_URL}" -O "${CUDA_INSTALLER}"
+
+    echo "Extracting CUDA toolkit..."
+    7z x "${CUDA_INSTALLER}" -o"extracted" -y
+
+    # Clean up installer to save space
+    rm -f "${CUDA_INSTALLER}"
+
+    echo "Windows CUDA toolkit installation complete"
+    echo "WINDOWS_CUDA_HOME=${INSTALL_DIR}/extracted/cuda_cudart/cudart"
+}
+
+# Parse command line arguments
+INSTALL_MINGW=false
+INSTALL_CUDA=false
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --mingw)
+            INSTALL_MINGW=true
+            shift
+            ;;
+        --cuda)
+            INSTALL_CUDA=true
+            shift
+            ;;
+        --all)
+            INSTALL_MINGW=true
+            INSTALL_CUDA=true
+            shift
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Usage: $0 [--mingw] [--cuda] [--all]"
+            exit 1
+            ;;
+    esac
+done
+
+# Default to installing everything if no options specified
+if [ "${INSTALL_MINGW}" = false ] && [ "${INSTALL_CUDA}" = false ]; then
+    INSTALL_MINGW=true
+    INSTALL_CUDA=true
+fi
+
+if [ "${INSTALL_MINGW}" = true ]; then
+    install_mingw
+fi
+
+if [ "${INSTALL_CUDA}" = true ]; then
+    install_windows_cuda
+fi
+
+echo "Installation complete"
diff --git a/.ci/docker/common/install_pytorch_cuda.sh b/.ci/docker/common/install_pytorch_cuda.sh
new file mode 100644
index 00000000000..b75f8d564e2
--- /dev/null
+++ b/.ci/docker/common/install_pytorch_cuda.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install PyTorch with CUDA support from prebuilt wheels
+# This is used for the cuda-windows Docker image to get a specific CUDA version
+
+set -ex
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+# Default CUDA version if not specified
+CUDA_VERSION="${CUDA_VERSION:-12.8}"
+
+# Convert CUDA version to PyTorch wheel suffix (e.g., 12.8 -> cu128)
+CUDA_SUFFIX="cu$(echo ${CUDA_VERSION} | tr -d '.')"
+
+echo "Installing PyTorch with CUDA ${CUDA_VERSION} (${CUDA_SUFFIX})..."
+
+# Install PyTorch from nightly with specific CUDA version
+pip_install torch torchvision torchaudio --index-url "https://download.pytorch.org/whl/nightly/${CUDA_SUFFIX}"
+
+# Verify installation
+python3 -c "import torch; print(f'PyTorch {torch.__version__} installed with CUDA {torch.version.cuda}')"
+
+echo "PyTorch CUDA installation complete"
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index b7478df5489..118873d9155 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -98,5 +98,17 @@ ARG QNN_SDK
 
 ARG MEDIATEK_SDK
 
+ARG CUDA_WINDOWS_CROSS_COMPILE
+ARG CUDA_VERSION
+COPY ./common/install_pytorch_cuda.sh install_pytorch_cuda.sh
+COPY ./common/install_cuda_windows_cross_compile.sh install_cuda_windows_cross_compile.sh
+COPY ./common/utils.sh utils.sh
+RUN if [ -n "${CUDA_WINDOWS_CROSS_COMPILE}" ]; then \
+    CUDA_VERSION=${CUDA_VERSION} bash ./install_pytorch_cuda.sh && \
+    bash ./install_cuda_windows_cross_compile.sh; \
+    fi
+RUN rm -f install_pytorch_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
+ENV WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
+
 USER ci-user
 CMD ["bash"]
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 98e950aec05..3318697abd7 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -1,7 +1,7 @@
 # Test ExecuTorch CUDA Windows Cross-Compilation Export
 # This workflow tests model export targeting CUDA Windows using optimum-executorch.
-# It runs on a Linux machine with CUDA and uses the install_windows_cuda_deps.py
-# script to install the Windows CUDA cross-compilation dependencies.
+# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
+# Docker image which has mingw pre-installed for Windows cross-compilation.
 
 name: Test CUDA Windows Export
 
@@ -53,20 +53,24 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
-      use-custom-docker-registry: false
+      gpu-arch-version: 12.8
+      docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
       submodules: recursive
       upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
 
-        echo "::group::Setup ExecuTorch"
-        ./install_executorch.sh
+        echo "::group::Verify pre-installed dependencies"
+        x86_64-w64-mingw32-g++ --version
+        python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
+        echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
+        ls -la "${WINDOWS_CUDA_HOME}"
         echo "::endgroup::"
 
-        echo "::group::Install Windows CUDA Dependencies"
-        python scripts/install_windows_cuda_deps.py
+        echo "::group::Setup ExecuTorch"
+        # Use --use-pt-pinned-commit to skip reinstalling PyTorch (already in Docker with CUDA support)
+        ./install_executorch.sh --use-pt-pinned-commit
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index e3b72a6bcd6..7243c23dc03 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -41,7 +41,8 @@ jobs:
           executorch-ubuntu-22.04-zephyr-sdk,
           executorch-ubuntu-22.04-qnn-sdk,
           executorch-ubuntu-22.04-mediatek-sdk,
-          executorch-ubuntu-22.04-clang12-android
+          executorch-ubuntu-22.04-clang12-android,
+          executorch-ubuntu-22.04-cuda-windows
         ]
         include:
           - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64

From 0db8b5c8afc824587c36c51cdd0d59817c55675f Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 14:12:49 -0800
Subject: [PATCH 04/15] build cuda windows docker on cuda machine

---
 .github/workflows/docker-builds.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 7243c23dc03..0fa4d3685f7 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -41,12 +41,13 @@ jobs:
           executorch-ubuntu-22.04-zephyr-sdk,
           executorch-ubuntu-22.04-qnn-sdk,
           executorch-ubuntu-22.04-mediatek-sdk,
-          executorch-ubuntu-22.04-clang12-android,
-          executorch-ubuntu-22.04-cuda-windows
+          executorch-ubuntu-22.04-clang12-android
         ]
         include:
           - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64
             runner: linux.arm64.2xlarge
+          - docker-image-name: executorch-ubuntu-22.04-cuda-windows
+            runner: linux.g5.4xlarge.nvidia.gpu
 
     runs-on: [self-hosted, "${{ matrix.runner }}"]
     env:

From cc2b1ae72c62e554fb66176fe007378294f1f393 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 15:54:15 -0800
Subject: [PATCH 05/15] if you can believe it claude hallucinated the drivers

---
 .ci/docker/common/install_cuda_windows_cross_compile.sh | 5 ++---
 scripts/install_windows_cuda_deps.py                    | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
index 21d4fa76a72..a7e102cd137 100644
--- a/.ci/docker/common/install_cuda_windows_cross_compile.sh
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -15,9 +15,8 @@ INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"
 # Source: https://developer.nvidia.com/cuda-toolkit-archive
 declare -A CUDA_DRIVER_MAP=(
     ["12.6"]="12.6.3:561.17"
-    ["12.8"]="12.8.1:572.17"
-    ["12.9"]="12.9.1:576.40"
-    ["13.0"]="13.0.1:578.22"
+    ["12.8"]="12.8.1:572.61"
+    ["12.9"]="12.9.1:576.57"
 )
 
 install_mingw() {
diff --git a/scripts/install_windows_cuda_deps.py b/scripts/install_windows_cuda_deps.py
index 30611dcaaed..632305c26fc 100755
--- a/scripts/install_windows_cuda_deps.py
+++ b/scripts/install_windows_cuda_deps.py
@@ -24,10 +24,10 @@
 # Source: https://developer.nvidia.com/cuda-toolkit-archive
 CUDA_DRIVER_VERSION_MAP = {
     # CUDA 12.9.x
-    "12.9.1": "576.40",
-    "12.9.0": "576.02",
+    "12.9.1": "576.57",
+    "12.9.0": "576.33",
     # CUDA 12.8.x
-    "12.8.1": "572.17",
+    "12.8.1": "572.61",
     "12.8.0": "571.96",
     # CUDA 12.6.x
     "12.6.3": "561.17",

From b5d1b348b034ee0b548f12a1b7feeaaf056e3c30 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Mon, 15 Dec 2025 20:17:49 -0800
Subject: [PATCH 06/15] try conda

---
 .ci/docker/common/install_cuda_windows_cross_compile.sh | 4 ++--
 .ci/docker/common/install_pytorch_cuda.sh               | 2 +-
 .github/workflows/cuda-windows.yml                      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
index a7e102cd137..c350d822480 100644
--- a/.ci/docker/common/install_cuda_windows_cross_compile.sh
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -40,8 +40,8 @@ install_mingw() {
 }
 
 get_torch_cuda_version() {
-    # Query PyTorch for its CUDA version
-    python3 -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo ""
+    # Query PyTorch for its CUDA version using conda environment
+    conda run -n "py_${PYTHON_VERSION}" python3 -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo ""
 }
 
 install_windows_cuda() {
diff --git a/.ci/docker/common/install_pytorch_cuda.sh b/.ci/docker/common/install_pytorch_cuda.sh
index b75f8d564e2..f5b0396354d 100644
--- a/.ci/docker/common/install_pytorch_cuda.sh
+++ b/.ci/docker/common/install_pytorch_cuda.sh
@@ -25,6 +25,6 @@ echo "Installing PyTorch with CUDA ${CUDA_VERSION} (${CUDA_SUFFIX})..."
 pip_install torch torchvision torchaudio --index-url "https://download.pytorch.org/whl/nightly/${CUDA_SUFFIX}"
 
 # Verify installation
-python3 -c "import torch; print(f'PyTorch {torch.__version__} installed with CUDA {torch.version.cuda}')"
+conda_run python3 -c "import torch; print(f'PyTorch {torch.__version__} installed with CUDA {torch.version.cuda}')"
 
 echo "PyTorch CUDA installation complete"
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 3318697abd7..618c6559fb6 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -63,7 +63,7 @@ jobs:
 
         echo "::group::Verify pre-installed dependencies"
         x86_64-w64-mingw32-g++ --version
-        python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
+        conda run -n "py_${PYTHON_VERSION}" python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
         echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
         ls -la "${WINDOWS_CUDA_HOME}"
         echo "::endgroup::"

From 10909a015737a7f69f3aa95a762e5e234713d15d Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Tue, 16 Dec 2025 09:47:59 -0800
Subject: [PATCH 07/15] perms issue

---
 .ci/docker/common/install_cuda_windows_cross_compile.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
index c350d822480..e9310f7bf4c 100644
--- a/.ci/docker/common/install_cuda_windows_cross_compile.sh
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -92,6 +92,9 @@ install_windows_cuda() {
     echo "Extracting CUDA toolkit..."
     7z x "${CUDA_INSTALLER}" -o"extracted" -y
 
+    # Fix permissions so ci-user can access the files
+    chmod -R a+rX "${INSTALL_DIR}"
+
     # Clean up installer to save space
     rm -f "${CUDA_INSTALLER}"
 

From 4dde99f7c976f6c4bfb864de0a862bbe82af2c51 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Tue, 16 Dec 2025 12:04:48 -0800
Subject: [PATCH 08/15] try bumping pin

---
 torch_pin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_pin.py b/torch_pin.py
index e934463cb70..4f86c779974 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20251120"
+NIGHTLY_VERSION = "dev20251216"

From 550647db5f03ef6e8312764a4f441ca10ad56d85 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Tue, 16 Dec 2025 14:39:12 -0800
Subject: [PATCH 09/15] pin bump 2

---
 torch_pin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_pin.py b/torch_pin.py
index 4f86c779974..ab3f9c1c027 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
-TORCH_VERSION = "2.10.0"
+TORCH_VERSION = "2.11.0"
 NIGHTLY_VERSION = "dev20251216"

From 4e9a1f4a7a2b0e84404db234c8223c250fa0c78a Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Tue, 16 Dec 2025 17:47:43 -0800
Subject: [PATCH 10/15] pin 3

---
 torch_pin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_pin.py b/torch_pin.py
index ab3f9c1c027..17bde97f11b 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
 TORCH_VERSION = "2.11.0"
-NIGHTLY_VERSION = "dev20251216"
+NIGHTLY_VERSION = "dev20251214"

From 43bb970d916cb5f0b133019a699c95565340028e Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 17 Dec 2025 10:27:02 -0800
Subject: [PATCH 11/15] add nvcc to docker

---
 .ci/docker/common/install_cuda.sh | 66 ++++++++++++++++++++++++++++++++++++
 .ci/docker/ubuntu/Dockerfile      |  9 ++++-
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 .ci/docker/common/install_cuda.sh

diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
new file mode 100644
index 00000000000..8464fba0747
--- /dev/null
+++ b/.ci/docker/common/install_cuda.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install Linux CUDA toolkit
+# This installs nvcc and other CUDA development tools needed for compiling CUDA code
+
+set -ex
+
+# CUDA version must be specified (e.g., 12.8)
+CUDA_VERSION="${CUDA_VERSION:?CUDA_VERSION must be set}"
+
+# Convert version format (e.g., 12.8 -> 12-8 for package names)
+CUDA_VERSION_DASH=$(echo "${CUDA_VERSION}" | tr '.' '-')
+
+# Versioned toolkit prefix; nvcc is verified under this path after installation
+CUDA_PREFIX="/usr/local/cuda-${CUDA_VERSION}"
+
+# Add NVIDIA package repository
+apt-get update
+apt-get install -y --no-install-recommends \
+    gnupg2 \
+    ca-certificates \
+    wget
+
+# Download and install the CUDA keyring
+wget -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb" -O /tmp/cuda-keyring.deb
+dpkg -i /tmp/cuda-keyring.deb
+rm /tmp/cuda-keyring.deb
+
+apt-get update
+
+# Install CUDA toolkit (nvcc and development libraries)
+# We install a minimal set of packages needed for compilation:
+# - cuda-nvcc: The CUDA compiler
+# - cuda-cudart-dev: CUDA runtime development files
+# - cuda-nvrtc-dev: CUDA runtime compilation library
+# - libcublas-dev: cuBLAS development files
+# - libcusparse-dev: cuSPARSE development files
+# - libcufft-dev: cuFFT development files
+apt-get install -y --no-install-recommends \
+    "cuda-nvcc-${CUDA_VERSION_DASH}" \
+    "cuda-cudart-dev-${CUDA_VERSION_DASH}" \
+    "cuda-nvrtc-dev-${CUDA_VERSION_DASH}" \
+    "libcublas-dev-${CUDA_VERSION_DASH}" \
+    "libcusparse-dev-${CUDA_VERSION_DASH}" \
+    "libcufft-dev-${CUDA_VERSION_DASH}"
+
+# Clean up
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+
+# Verify installation
+"${CUDA_PREFIX}/bin/nvcc" --version
+
+# The Dockerfile exports CUDA_HOME=/usr/local/cuda; create the unversioned
+# symlink if the minimal package set above did not already provide it.
+if [ ! -e /usr/local/cuda ]; then
+    ln -sfn "${CUDA_PREFIX}" /usr/local/cuda
+fi
+
+echo "CUDA ${CUDA_VERSION} toolkit installation complete"
+echo "CUDA_HOME=${CUDA_PREFIX}"
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 118873d9155..24e34e6189b 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -100,14 +100,21 @@ ARG MEDIATEK_SDK
 
 ARG CUDA_WINDOWS_CROSS_COMPILE
 ARG CUDA_VERSION
+COPY ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_pytorch_cuda.sh install_pytorch_cuda.sh
 COPY ./common/install_cuda_windows_cross_compile.sh install_cuda_windows_cross_compile.sh
 COPY ./common/utils.sh utils.sh
 RUN if [ -n "${CUDA_WINDOWS_CROSS_COMPILE}" ]; then \
+    CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda.sh && \
     CUDA_VERSION=${CUDA_VERSION} bash ./install_pytorch_cuda.sh && \
     bash ./install_cuda_windows_cross_compile.sh; \
     fi
-RUN rm -f install_pytorch_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
+RUN rm -f install_cuda.sh install_pytorch_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
+# Set up CUDA environment for Linux compilation (nvcc, etc.)
+ENV CUDA_HOME=/usr/local/cuda
+ENV PATH=${CUDA_HOME}/bin:${PATH}
+ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+# Windows CUDA for cross-compilation
 ENV WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
 
 USER ci-user

From dc7087b33942cbb7f4e86d7ce31d08454379531d Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 17 Dec 2025 12:10:32 -0800
Subject: [PATCH 12/15] try not using docker again

---
 .github/workflows/cuda-windows.yml | 38 ++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 618c6559fb6..b540ebabbf7 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -1,7 +1,7 @@
 # Test ExecuTorch CUDA Windows Cross-Compilation Export
 # This workflow tests model export targeting CUDA Windows using optimum-executorch.
-# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
-# Docker image which has mingw pre-installed for Windows cross-compilation.
+# It runs on a Linux machine with CUDA and installs mingw + Windows CUDA SDK at runtime
+# for Windows cross-compilation.
 
 name: Test CUDA Windows Export
 
@@ -54,23 +54,47 @@ jobs:
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.8
-      docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
+      use-custom-docker-registry: false
       submodules: recursive
       upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
 
-        echo "::group::Verify pre-installed dependencies"
+        echo "::group::Install Windows cross-compilation dependencies"
+        # Install mingw-w64 cross-compiler
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends g++-mingw-w64-x86-64 mingw-w64-tools p7zip-full
         x86_64-w64-mingw32-g++ --version
-        conda run -n "py_${PYTHON_VERSION}" python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
+
+        # Download and extract Windows CUDA toolkit
+        # We need this for cross-compiling CUDA code for Windows
+        # Note: CUDA 12.8 installer is versioned as 12.8.1 with driver 572.61
+        CUDA_INSTALLER_VERSION="12.8.1"
+        CUDA_DRIVER_VERSION="572.61"
+        WINDOWS_CUDA_INSTALL_DIR="/tmp/cuda-windows"
+        mkdir -p "${WINDOWS_CUDA_INSTALL_DIR}"
+
+        CUDA_INSTALLER="cuda_${CUDA_INSTALLER_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
+        CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_INSTALLER_VERSION}/local_installers/${CUDA_INSTALLER}"
+
+        echo "Downloading Windows CUDA toolkit from ${CUDA_URL}..."
+        wget -q "${CUDA_URL}" -O "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}"
+
+        echo "Extracting Windows CUDA toolkit..."
+        7z x "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}" -o"${WINDOWS_CUDA_INSTALL_DIR}/extracted" -y
+
+        # Clean up installer
+        rm -f "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}"
+
+        # Set environment variable for Windows CUDA
+        export WINDOWS_CUDA_HOME="${WINDOWS_CUDA_INSTALL_DIR}/extracted/cuda_cudart/cudart"
         echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
         ls -la "${WINDOWS_CUDA_HOME}"
         echo "::endgroup::"
 
         echo "::group::Setup ExecuTorch"
-        # Use --use-pt-pinned-commit to skip reinstalling PyTorch (already in Docker with CUDA support)
-        ./install_executorch.sh --use-pt-pinned-commit
+        ./install_executorch.sh
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"

From ba3b9a04363f8aff766a002e8879609d52afca1d Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 17 Dec 2025 14:04:32 -0800
Subject: [PATCH 13/15] back to docker

---
 .ci/docker/ubuntu/Dockerfile       |  3 ++-
 .github/workflows/cuda-windows.yml | 38 ++++++------------------------
 2 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 24e34e6189b..fab9df65795 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -113,7 +113,8 @@ RUN rm -f install_cuda.sh install_pytorch_cuda.sh install_cuda_windows_cross_com
 # Set up CUDA environment for Linux compilation (nvcc, etc.)
 ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=${CUDA_HOME}/bin:${PATH}
-ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+# Ensure system libstdc++ is found before conda's (GLIBCXX_3.4.30 compatibility); append the old value only when set so no empty (cwd) entry is left
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 # Windows CUDA for cross-compilation
 ENV WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart
 
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index b540ebabbf7..17081bad80a 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -1,7 +1,7 @@
 # Test ExecuTorch CUDA Windows Cross-Compilation Export
 # This workflow tests model export targeting CUDA Windows using optimum-executorch.
-# It runs on a Linux machine with CUDA and installs mingw + Windows CUDA SDK at runtime
-# for Windows cross-compilation.
+# It runs on a Linux machine with CUDA and uses the executorch-ubuntu-22.04-cuda-windows
+# Docker image which has mingw and Windows CUDA SDK pre-installed for cross-compilation.
 
 name: Test CUDA Windows Export
 
@@ -54,47 +54,23 @@ jobs:
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.8
-      use-custom-docker-registry: false
+      docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
       submodules: recursive
       upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
 
-        echo "::group::Install Windows cross-compilation dependencies"
-        # Install mingw-w64 cross-compiler
-        sudo apt-get update
-        sudo apt-get install -y --no-install-recommends g++-mingw-w64-x86-64 mingw-w64-tools p7zip-full
+        echo "::group::Verify pre-installed dependencies"
         x86_64-w64-mingw32-g++ --version
-
-        # Download and extract Windows CUDA toolkit
-        # We need this for cross-compiling CUDA code for Windows
-        # Note: CUDA 12.8 installer is versioned as 12.8.1 with driver 572.61
-        CUDA_INSTALLER_VERSION="12.8.1"
-        CUDA_DRIVER_VERSION="572.61"
-        WINDOWS_CUDA_INSTALL_DIR="/tmp/cuda-windows"
-        mkdir -p "${WINDOWS_CUDA_INSTALL_DIR}"
-
-        CUDA_INSTALLER="cuda_${CUDA_INSTALLER_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
-        CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_INSTALLER_VERSION}/local_installers/${CUDA_INSTALLER}"
-
-        echo "Downloading Windows CUDA toolkit from ${CUDA_URL}..."
-        wget -q "${CUDA_URL}" -O "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}"
-
-        echo "Extracting Windows CUDA toolkit..."
-        7z x "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}" -o"${WINDOWS_CUDA_INSTALL_DIR}/extracted" -y
-
-        # Clean up installer
-        rm -f "${WINDOWS_CUDA_INSTALL_DIR}/${CUDA_INSTALLER}"
-
-        # Set environment variable for Windows CUDA
-        export WINDOWS_CUDA_HOME="${WINDOWS_CUDA_INSTALL_DIR}/extracted/cuda_cudart/cudart"
+        nvcc --version
+        python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
         echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
         ls -la "${WINDOWS_CUDA_HOME}"
         echo "::endgroup::"
 
         echo "::group::Setup ExecuTorch"
-        ./install_executorch.sh
+        ./install_executorch.sh --use-pt-pinned-commit
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"

From 19d961d50ac017242ab46b9d683c4081a946933c Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 17 Dec 2025 14:09:29 -0800
Subject: [PATCH 14/15] remove hallucinated hack

---
 .ci/docker/common/install_conda.sh | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
index 8c1c7da63d4..7e36af55cac 100755
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@@ -54,25 +54,6 @@ install_pip_dependencies() {
   popd
 }
 
-fix_conda_ubuntu_libstdcxx() {
-  cat /etc/issue
-  # WARNING: This is a HACK from PyTorch core to be able to build PyTorch on 22.04.
-  # Specifically, ubuntu-20+ all comes lib libstdc++ newer than 3.30+, but anaconda
-  # is stuck with 3.29. So, remove libstdc++6.so.3.29 as installed by
-  # https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0
-  #
-  # PyTorch sev: https://github.com/pytorch/pytorch/issues/105248
-  # Ref: https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
-  if grep -e "2[02].04." /etc/issue >/dev/null; then
-    rm /opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so*
-  fi
-}
-
 install_miniconda
 install_python
 install_pip_dependencies
-# Hack breaks the job on aarch64 but is still necessary everywhere
-# else.
-if [ "$(uname -m)" != "aarch64" ]; then
-    fix_conda_ubuntu_libstdcxx
-fi

From daab8f0c76f1922976e194fdeafac5b92419191c Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 17 Dec 2025 22:31:43 -0800
Subject: [PATCH 15/15] python3 -> python

---
 .ci/docker/common/install_pytorch_cuda.sh | 12 ++++++++++--
 .github/workflows/cuda-windows.yml        |  4 ++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.ci/docker/common/install_pytorch_cuda.sh b/.ci/docker/common/install_pytorch_cuda.sh
index f5b0396354d..ab836fde063 100644
--- a/.ci/docker/common/install_pytorch_cuda.sh
+++ b/.ci/docker/common/install_pytorch_cuda.sh
@@ -16,15 +16,23 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 # Default CUDA version if not specified
 CUDA_VERSION="${CUDA_VERSION:-12.8}"
 
+# Ensure PYTHON_VERSION is set (should be set by Dockerfile ENV); ${VAR:-} keeps the check usable under `set -u`
+if [ -z "${PYTHON_VERSION:-}" ]; then
+    echo "ERROR: PYTHON_VERSION environment variable is not set" >&2
+    exit 1
+fi
+
+echo "Using Python version: ${PYTHON_VERSION}"
+
 # Convert CUDA version to PyTorch wheel suffix (e.g., 12.8 -> cu128)
 CUDA_SUFFIX="cu$(echo ${CUDA_VERSION} | tr -d '.')"
 
 echo "Installing PyTorch with CUDA ${CUDA_VERSION} (${CUDA_SUFFIX})..."
 
-# Install PyTorch from nightly with specific CUDA version
+# Install PyTorch from nightly with specific CUDA version into the conda environment
 pip_install torch torchvision torchaudio --index-url "https://download.pytorch.org/whl/nightly/${CUDA_SUFFIX}"
 
 # Verify installation
-conda_run python3 -c "import torch; print(f'PyTorch {torch.__version__} installed with CUDA {torch.version.cuda}')"
+conda_run python -c "import torch; print(f'PyTorch {torch.__version__} installed with CUDA {torch.version.cuda}')"
 
 echo "PyTorch CUDA installation complete"
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 17081bad80a..14289641117 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -64,13 +64,13 @@ jobs:
         echo "::group::Verify pre-installed dependencies"
         x86_64-w64-mingw32-g++ --version
         nvcc --version
-        python3 -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
+        python -c "import torch; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')"
         echo "WINDOWS_CUDA_HOME=${WINDOWS_CUDA_HOME}"
         ls -la "${WINDOWS_CUDA_HOME}"
         echo "::endgroup::"
 
         echo "::group::Setup ExecuTorch"
-        ./install_executorch.sh --use-pt-pinned-commit
+        PYTHON_EXECUTABLE=python ./install_executorch.sh --use-pt-pinned-commit
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"