From 14e7d4b4841dcea340aa2a5b67f20b734b1194e2 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Dec 2025 00:49:57 -0800
Subject: [PATCH] [slim tensor migration 2/n] update slimtensor namespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This stack migrates SlimTensor into the ExecuTorch stack so that it can
serve as the internal tensor representation of the CUDA backend.

This diff updates the namespaces of the SlimTensor files to the ExecuTorch
standard. More specifically:

1. namespace standalone::slim → namespace executorch::backends::aoti::slim
2. namespace standalone::c10 → namespace executorch::backends::aoti::slim::c10
3. Added a backward-compatible torch::executor::SlimTensor alias

Differential Revision: [D89442123](https://our.internmc.facebook.com/intern/diff/D89442123/)

[ghstack-poisoned]
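For reviewers, a minimal sketch of the rename this diff performs and of the
backward-compatible alias described in point 3. The torch::executor spelling
is taken from the summary above; the exact declaration site of the alias is
not part of the hunks shown below, so treat this as illustrative rather than
as the literal code added by this diff:

    // Old spelling (pre-migration):
    //   standalone::slim::SlimTensor t = ...;
    // New spelling after this diff:
    //   executorch::backends::aoti::slim::SlimTensor t = ...;

    // Hypothetical shape of the backward-compatible alias from point 3:
    namespace torch::executor {
    using SlimTensor = ::executorch::backends::aoti::slim::SlimTensor;
    } // namespace torch::executor

With an alias like this, existing call sites that spell
torch::executor::SlimTensor keep compiling, while new code can name the
executorch::backends::aoti::slim type directly.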
---
 backends/aoti/slim/c10/core/Contiguity.h      |   4 +-
 backends/aoti/slim/c10/core/Device.h          |  15 +-
 backends/aoti/slim/c10/core/DeviceType.h      |   9 +-
 backends/aoti/slim/c10/core/Layout.h          |   4 +-
 backends/aoti/slim/c10/core/MemoryFormat.h    |   6 +-
 backends/aoti/slim/c10/core/Scalar.h          |  68 +--
 backends/aoti/slim/c10/core/ScalarType.h      | 434 ++++++++++--------
 backends/aoti/slim/c10/core/SizesAndStrides.h |   4 +-
 backends/aoti/slim/c10/core/WrapDimMinimal.h  |   8 +-
 backends/aoti/slim/c10/util/Array.h           |   4 +-
 backends/aoti/slim/c10/util/ArrayRef.h        |  40 +-
 backends/aoti/slim/c10/util/BFloat16-inl.h    |  66 +--
 backends/aoti/slim/c10/util/BFloat16-math.h   | 174 ++++---
 backends/aoti/slim/c10/util/BFloat16.h        |   4 +-
 backends/aoti/slim/c10/util/Exception.h       |  23 +-
 .../aoti/slim/c10/util/Float4_e2m1fn_x2.h     |   4 +-
 .../aoti/slim/c10/util/Float8_e4m3fn-inl.h    |  63 ++-
 backends/aoti/slim/c10/util/Float8_e4m3fn.h   |   4 +-
 .../aoti/slim/c10/util/Float8_e4m3fnuz-inl.h  |  73 +--
 backends/aoti/slim/c10/util/Float8_e4m3fnuz.h |   4 +-
 backends/aoti/slim/c10/util/Float8_e5m2-inl.h |  61 +--
 backends/aoti/slim/c10/util/Float8_e5m2.h     |   4 +-
 .../aoti/slim/c10/util/Float8_e5m2fnuz-inl.h  |  73 +--
 backends/aoti/slim/c10/util/Float8_e5m2fnuz.h |   4 +-
 .../aoti/slim/c10/util/Float8_e8m0fnu-inl.h   |  63 ++-
 backends/aoti/slim/c10/util/Float8_e8m0fnu.h  |   7 +-
 backends/aoti/slim/c10/util/Float8_fnuz_cvt.h |   4 +-
 backends/aoti/slim/c10/util/Half-inl.h        |  60 ++-
 backends/aoti/slim/c10/util/Half.h            |   8 +-
 backends/aoti/slim/c10/util/StringUtil.h      |   4 +-
 backends/aoti/slim/c10/util/TypeCast.h        | 133 +++---
 .../aoti/slim/c10/util/TypeSafeSignMath.h     |  13 +-
 backends/aoti/slim/c10/util/accumulate.h      |   4 +-
 backends/aoti/slim/c10/util/bit_cast.h        |   4 +-
 backends/aoti/slim/c10/util/bits.h            |   4 +-
 backends/aoti/slim/c10/util/complex.h         | 107 +++--
 backends/aoti/slim/c10/util/complex_math.h    | 358 ++++++++-------
 backends/aoti/slim/c10/util/complex_utils.h   |  14 +-
 backends/aoti/slim/c10/util/copysign.h        |   4 +-
 .../aoti/slim/c10/util/floating_point_utils.h |   8 +-
 backends/aoti/slim/c10/util/generic_math.h    |  15 +-
 backends/aoti/slim/c10/util/irange.h          |  10 +-
 backends/aoti/slim/c10/util/llvmMathExtras.h  |   6 +-
 backends/aoti/slim/c10/util/overflows.h       |   9 +-
 backends/aoti/slim/c10/util/qint32.h          |   4 +-
 backends/aoti/slim/c10/util/qint8.h           |   4 +-
 backends/aoti/slim/c10/util/quint2x4.h        |   4 +-
 backends/aoti/slim/c10/util/quint4x2.h        |   4 +-
 backends/aoti/slim/c10/util/quint8.h          |   4 +-
 backends/aoti/slim/c10/util/safe_numerics.h   |  13 +-
 backends/aoti/slim/core/SlimTensor.h          | 182 ++++----
 .../aoti/slim/core/SlimTensorResize-incl.h    |  31 +-
 backends/aoti/slim/core/SlimTensorView-incl.h |  33 +-
 backends/aoti/slim/core/Storage.h             | 107 +++--
 backends/aoti/slim/cuda/Guard.h               |  35 +-
 backends/aoti/slim/factory/Empty.h            |  20 +-
 backends/aoti/slim/factory/Factory.h          |  20 +-
 backends/aoti/slim/factory/FromBlob.h         |  20 +-
 backends/aoti/slim/factory/FromScalar.h       |   8 +-
 backends/aoti/slim/factory/Pad.h              |  18 +-
 .../slim/tests/test_slim_tensor_basic.cpp     |  86 +++-
 .../aoti/slim/tests/test_slim_tensor_cuda.cpp | 106 +++--
 backends/aoti/slim/util/SharedPtr.h           |   6 +-
 backends/aoti/slim/util/SizeUtil.h            |  60 ++-
 64 files changed, 1547 insertions(+), 1206 deletions(-)

diff --git a/backends/aoti/slim/c10/core/Contiguity.h b/backends/aoti/slim/c10/core/Contiguity.h
index d5ff49561ab..80db87eb588 100644
--- a/backends/aoti/slim/c10/core/Contiguity.h
+++ b/backends/aoti/slim/c10/core/Contiguity.h
@@ -6,7 +6,7 @@
 #include
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 template <typename T>
 bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
@@ -148,4 +148,4 @@ bool _compute_non_overlapping_and_dense(
   return true;
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/Device.h b/backends/aoti/slim/c10/core/Device.h
index a9a6d3a8136..02e88a30b1e 100644
--- a/backends/aoti/slim/c10/core/Device.h
+++ b/backends/aoti/slim/c10/core/Device.h
@@ -17,7 +17,7 @@
 
 // Copied from c10/core/DeviceType.h with some modifications
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
 enum class DeviceStringParsingState {
   kSTART,
@@ -341,18 +341,21 @@ inline std::ostream& operator<<(std::ostream& stream, const Device& device) {
   stream << device.str();
   return stream;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 
 namespace std {
 template <>
-struct hash<standalone::c10::Device> {
-  size_t operator()(standalone::c10::Device d) const noexcept {
+struct hash<executorch::backends::aoti::slim::c10::Device> {
+  size_t operator()(
+      executorch::backends::aoti::slim::c10::Device d) const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
     static_assert(
-        sizeof(standalone::c10::DeviceType) == 1, "DeviceType is not 8-bit");
+        sizeof(executorch::backends::aoti::slim::c10::DeviceType) == 1,
+        "DeviceType is not 8-bit");
     static_assert(
-        sizeof(standalone::c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
+        sizeof(executorch::backends::aoti::slim::c10::DeviceIndex) == 1,
+        "DeviceIndex is not 8-bit");
     // Note [Hazard when concatenating signed integers]
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // We must first convert to a same-sized unsigned type, before promoting to
diff --git a/backends/aoti/slim/c10/core/DeviceType.h b/backends/aoti/slim/c10/core/DeviceType.h
index f2631a48f2d..eb024a3595d 100644
--- a/backends/aoti/slim/c10/core/DeviceType.h
+++ b/backends/aoti/slim/c10/core/DeviceType.h
@@ -15,7 +15,7 @@
 
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class DeviceType : int8_t {
   CPU = 0,
   CUDA = 1, // CUDA.
@@ -122,12 +122,13 @@ inline std::ostream& operator<<(std::ostream& stream, DeviceType type) {
   stream << DeviceTypeName(type, /* lower case */ true);
   return stream;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 
 namespace std {
 template <>
-struct hash<standalone::c10::DeviceType> {
-  std::size_t operator()(standalone::c10::DeviceType k) const {
+struct hash<executorch::backends::aoti::slim::c10::DeviceType> {
+  std::size_t operator()(
+      executorch::backends::aoti::slim::c10::DeviceType k) const {
     return std::hash<int>()(static_cast<int>(k));
   }
 };
diff --git a/backends/aoti/slim/c10/core/Layout.h b/backends/aoti/slim/c10/core/Layout.h
index 79230f23bb7..4d7b5499088 100644
--- a/backends/aoti/slim/c10/core/Layout.h
+++ b/backends/aoti/slim/c10/core/Layout.h
@@ -5,7 +5,7 @@
 #include
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class Layout : int8_t {
   Strided,
   Sparse,
@@ -50,4 +50,4 @@ inline std::ostream& operator<<(std::ostream& stream, c10::Layout layout) {
   }
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/MemoryFormat.h b/backends/aoti/slim/c10/core/MemoryFormat.h
index 756caf64f26..68f1a6d7357 100644
--- a/backends/aoti/slim/c10/core/MemoryFormat.h
+++ b/backends/aoti/slim/c10/core/MemoryFormat.h
@@ -25,7 +25,7 @@
 // Regardless of input tensors format, the output should be in channels_last
 // format.
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class MemoryFormat : int8_t {
   Contiguous,
   Preserve,
@@ -38,7 +38,7 @@ enum class MemoryFormat : int8_t {
 // the memory format could be preserved, and it was switched to old default
 // behaviour of contiguous
 #define LEGACY_CONTIGUOUS_MEMORY_FORMAT \
-  ::standalone::c10::get_contiguous_memory_format()
+  ::executorch::backends::aoti::slim::c10::get_contiguous_memory_format()
 
 inline MemoryFormat get_contiguous_memory_format() {
   return MemoryFormat::Contiguous;
@@ -288,4 +288,4 @@ inline bool is_channels_last_strides_3d(
   return is_channels_last_strides_3d(sizes, strides);
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/Scalar.h b/backends/aoti/slim/c10/core/Scalar.h
index 1c61ecb4704..b46add34946 100644
--- a/backends/aoti/slim/c10/core/Scalar.h
+++ b/backends/aoti/slim/c10/core/Scalar.h
@@ -15,7 +15,7 @@
 
 // Copy-pasted from c10/core/Scalar.h, but dropping SymScalar support
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 /**
  * Scalar represents a 0-dimensional tensor which contains a single element.
@@ -86,22 +86,23 @@ class Scalar {
     v.i = convert(vv);
   }
 
-#define DEFINE_ACCESSOR(type, name) \
-  type to##name() const { \
-    if (Tag::HAS_d == tag) { \
-      return checked_convert<type, double>(v.d, #type); \
-    } else if (Tag::HAS_z == tag) { \
-      return checked_convert<type, standalone::c10::complex<double>>( \
-          v.z, #type); \
-    } \
-    if (Tag::HAS_b == tag) { \
-      return checked_convert<type, bool>(v.i, #type); \
-    } else if (Tag::HAS_i == tag) { \
-      return checked_convert<type, int64_t>(v.i, #type); \
-    } else if (Tag::HAS_u == tag) { \
-      return checked_convert<type, uint64_t>(v.u, #type); \
-    } \
-    STANDALONE_CHECK(false) \
+#define DEFINE_ACCESSOR(type, name) \
+  type to##name() const { \
+    if (Tag::HAS_d == tag) { \
+      return checked_convert<type, double>(v.d, #type); \
+    } else if (Tag::HAS_z == tag) { \
+      return checked_convert< \
+          type, \
+          executorch::backends::aoti::slim::c10::complex<double>>(v.z, #type); \
+    } \
+    if (Tag::HAS_b == tag) { \
+      return checked_convert<type, bool>(v.i, #type); \
+    } else if (Tag::HAS_i == tag) { \
+      return checked_convert<type, int64_t>(v.i, #type); \
+    } else if (Tag::HAS_u == tag) { \
+      return checked_convert<type, uint64_t>(v.u, #type); \
+    } \
+    STANDALONE_CHECK(false) \
   }
 
 // TODO: Support ComplexHalf accessor
@@ -193,8 +194,9 @@ class Scalar {
 
   template <
       typename T,
-      typename std::enable_if_t<!standalone::c10::is_complex<T>::value, int> =
-          0>
+      typename std::enable_if_t<
+          !executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          int> = 0>
   bool equal(T num) const {
     if (isComplex()) {
       auto val = v.z;
@@ -223,7 +225,9 @@
 
   template <
       typename T,
-      typename std::enable_if_t<standalone::c10::is_complex<T>::value, int> = 0>
+      typename std::enable_if_t<
+          executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          int> = 0>
   bool equal(T num) const {
     if (isComplex()) {
       return v.z == num;
@@ -257,20 +261,20 @@
     }
   }
 
-  standalone::c10::ScalarType type() const {
+  executorch::backends::aoti::slim::c10::ScalarType type() const {
     if (isComplex()) {
-      return standalone::c10::ScalarType::ComplexDouble;
+      return executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble;
     } else if (isFloatingPoint()) {
-      return standalone::c10::ScalarType::Double;
+      return executorch::backends::aoti::slim::c10::ScalarType::Double;
     } else if (isIntegral(/*includeBool=*/false)) {
       // Represent all integers as long, UNLESS it is unsigned and therefore
       // unrepresentable as long
       if (Tag::HAS_u == tag) {
-        return standalone::c10::ScalarType::UInt64;
+        return executorch::backends::aoti::slim::c10::ScalarType::UInt64;
       }
-      return standalone::c10::ScalarType::Long;
+      return executorch::backends::aoti::slim::c10::ScalarType::Long;
     } else if (isBoolean()) {
-      return standalone::c10::ScalarType::Bool;
+      return executorch::backends::aoti::slim::c10::ScalarType::Bool;
     } else {
       throw std::runtime_error("Unknown scalar type.");
     }
@@ -313,7 +317,7 @@ class Scalar {
     int64_t i;
     // See Note [Meaning of HAS_u]
     uint64_t u;
-    standalone::c10::complex<double> z;
+    executorch::backends::aoti::slim::c10::complex<double> z;
     // NOLINTNEXTLINE(modernize-use-equals-default)
     v_t() {} // default constructor
   } v;
@@ -330,7 +334,8 @@
 
   template <
       typename T,
      typename std::enable_if_t<
-          !std::is_integral_v<T> && !standalone::c10::is_complex<T>::value,
+          !std::is_integral_v<T> &&
+              !executorch::backends::aoti::slim::c10::is_complex<T>::value,
          bool>* = nullptr>
   Scalar(T vv, bool) : tag(Tag::HAS_d) {
     v.d = convert(vv);
   }
 
   template <
       typename T,
-      typename std::enable_if_t<standalone::c10::is_complex<T>::value, bool>* =
-          nullptr>
+      typename std::enable_if_t<
+          executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          bool>* = nullptr>
   Scalar(T vv, bool) : tag(Tag::HAS_z) {
     v.z = convert(vv);
   }
@@ -357,4 +363,4 @@ DEFINE_TO(uint32_t, UInt32)
 DEFINE_TO(uint64_t, UInt64)
 #undef DEFINE_TO
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/ScalarType.h b/backends/aoti/slim/c10/core/ScalarType.h
index 6daeaad5f2c..6481b3d2c4b 100644
--- a/backends/aoti/slim/c10/core/ScalarType.h
+++ b/backends/aoti/slim/c10/core/ScalarType.h
@@ -26,7 +26,7 @@
 #include
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 // dummy struct for uint1 to uint7, actual functionality
 // of these dtypes will be implemented in python with Tensor subclass
@@ -60,53 +60,62 @@ struct dummy_int1_7_t {};
 
 // NB: Order matters for this macro; it is relied upon in
 // _promoteTypesLookup and the serialization format.
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \
-  _(uint8_t, Byte) /* 0 */ \
-  _(int8_t, Char) /* 1 */ \
-  _(int16_t, Short) /* 2 */ \
-  _(int, Int) /* 3 */ \
-  _(int64_t, Long) /* 4 */ \
-  _(standalone::c10::Half, Half) /* 5 */ \
-  _(float, Float) /* 6 */ \
-  _(double, Double) /* 7 */ \
-  _(standalone::c10::complex<standalone::c10::Half>, ComplexHalf) /* 8 */ \
-  _(standalone::c10::complex<float>, ComplexFloat) /* 9 */ \
-  _(standalone::c10::complex<double>, ComplexDouble) /* 10 */ \
-  _(bool, Bool) /* 11 */ \
-  _(standalone::c10::qint8, QInt8) /* 12 */ \
-  _(standalone::c10::quint8, QUInt8) /* 13 */ \
-  _(standalone::c10::qint32, QInt32) /* 14 */ \
-  _(standalone::c10::BFloat16, BFloat16) /* 15 */ \
-  _(standalone::c10::quint4x2, QUInt4x2) /* 16 */ \
-  _(standalone::c10::quint2x4, QUInt2x4) /* 17 */ \
-  _(standalone::c10::bits1x8, Bits1x8) /* 18 */ \
-  _(standalone::c10::bits2x4, Bits2x4) /* 19 */ \
-  _(standalone::c10::bits4x2, Bits4x2) /* 20 */ \
-  _(standalone::c10::bits8, Bits8) /* 21 */ \
-  _(standalone::c10::bits16, Bits16) /* 22 */ \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) /* 23 */ \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) /* 24 */ \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) /* 25 */ \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) /* 26 */ \
-  _(uint16_t, UInt16) /* 27 */ \
-  _(uint32_t, UInt32) /* 28 */ \
-  _(uint64_t, UInt64) /* 29 */ \
-  _(standalone::c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \
-  _(standalone::c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \
-  _(standalone::c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \
-  _(standalone::c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \
-  _(standalone::c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \
-  _(standalone::c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \
-  _(standalone::c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \
-  _(standalone::c10::dummy_int1_7_t<1>, Int1) /* 37 */ \
-  _(standalone::c10::dummy_int1_7_t<2>, Int2) /* 38 */ \
-  _(standalone::c10::dummy_int1_7_t<3>, Int3) /* 39 */ \
-  _(standalone::c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
-  _(standalone::c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
-  _(standalone::c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
-  _(standalone::c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */ \
-  _(standalone::c10::Float4_e2m1fn_x2, Float4_e2m1fn_x2) /* 45 */
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \
+  _(uint8_t, Byte) /* 0 */ \
+  _(int8_t, Char) /* 1 */ \
+  _(int16_t, Short) /* 2 */ \
+  _(int, Int) /* 3 */ \
+  _(int64_t, Long) /* 4 */ \
+  _(executorch::backends::aoti::slim::c10::Half, Half) /* 5 */ \
+  _(float, Float) /* 6 */ \
+  _(double, Double) /* 7 */ \
+  _(executorch::backends::aoti::slim::c10::complex< \
+        executorch::backends::aoti::slim::c10::Half>, \
+    ComplexHalf) /* 8 */ \
+  _(executorch::backends::aoti::slim::c10::complex<float>, \
+    ComplexFloat) /* 9 */ \
+  _(executorch::backends::aoti::slim::c10::complex<double>, \
+    ComplexDouble) /* 10 */ \
+  _(bool, Bool) /* 11 */ \
+  _(executorch::backends::aoti::slim::c10::qint8, QInt8) /* 12 */ \
+  _(executorch::backends::aoti::slim::c10::quint8, QUInt8) /* 13 */ \
+  _(executorch::backends::aoti::slim::c10::qint32, QInt32) /* 14 */ \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) /* 15 */ \
+  _(executorch::backends::aoti::slim::c10::quint4x2, QUInt4x2) /* 16 */ \
+  _(executorch::backends::aoti::slim::c10::quint2x4, QUInt2x4) /* 17 */ \
+  _(executorch::backends::aoti::slim::c10::bits1x8, Bits1x8) /* 18 */ \
+  _(executorch::backends::aoti::slim::c10::bits2x4, Bits2x4) /* 19 */ \
+  _(executorch::backends::aoti::slim::c10::bits4x2, Bits4x2) /* 20 */ \
+  _(executorch::backends::aoti::slim::c10::bits8, Bits8) /* 21 */ \
+  _(executorch::backends::aoti::slim::c10::bits16, Bits16) /* 22 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) /* 23 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, \
+    Float8_e4m3fn) /* 24 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, \
+    Float8_e5m2fnuz) /* 25 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, \
+    Float8_e4m3fnuz) /* 26 */ \
+  _(uint16_t, UInt16) /* 27 */ \
+  _(uint32_t, UInt32) /* 28 */ \
+  _(uint64_t, UInt64) /* 29 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<1>, Int1) /* 37 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<2>, Int2) /* 38 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<3>, Int3) /* 39 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, \
+    Float8_e8m0fnu) /* 44 */ \
+  _(executorch::backends::aoti::slim::c10::Float4_e2m1fn_x2, \
+    Float4_e2m1fn_x2) /* 45 */
 
 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name). But beware: convert()
@@ -115,43 +124,45 @@ struct dummy_int1_7_t {};
 // TODO: To add unsigned int types here, we must define accumulate type.
 // But uint8 currently accumulates into int64, so we would have to make
 // an inconsistent choice for the larger types. Difficult.
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(_) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(standalone::c10::Half, Half) \
-  _(float, Float) \
-  _(double, Double) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble) \
-  _(bool, Bool) \
-  _(standalone::c10::BFloat16, BFloat16) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn)
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(_) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(executorch::backends::aoti::slim::c10::Half, Half) \
+  _(float, Float) \
+  _(double, Double) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble) \
+  _(bool, Bool) \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn)
 
 // This macro controls many of our C++ APIs, including constructors
 // for Scalar as well as the data() and item() accessors on Tensor
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(standalone::c10::Half, Half) \
-  _(float, Float) \
-  _(double, Double) \
-  _(standalone::c10::complex<standalone::c10::Half>, ComplexHalf) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble) \
-  _(bool, Bool) \
-  _(standalone::c10::BFloat16, BFloat16) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu)
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(executorch::backends::aoti::slim::c10::Half, Half) \
+  _(float, Float) \
+  _(double, Double) \
+  _(executorch::backends::aoti::slim::c10::complex< \
+        executorch::backends::aoti::slim::c10::Half>, \
+    ComplexHalf) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble) \
+  _(bool, Bool) \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, Float8_e8m0fnu)
 
 enum class ScalarType : int8_t {
 #define DEFINE_ST_ENUM_VAL_(_1, n) n,
@@ -168,19 +179,20 @@ namespace impl {
 
 // These are used to map ScalarTypes to C++ types.
 
-template <standalone::c10::ScalarType N>
+template <executorch::backends::aoti::slim::c10::ScalarType N>
 struct ScalarTypeToCPPType;
 
 #define SPECIALIZE_ScalarTypeToCPPType(cpp_type, scalar_type) \
   template <> \
-  struct ScalarTypeToCPPType<standalone::c10::ScalarType::scalar_type> { \
+  struct ScalarTypeToCPPType< \
+      executorch::backends::aoti::slim::c10::ScalarType::scalar_type> { \
     using type = cpp_type; \
 \
     /* This is a workaround for the CUDA bug which prevents */ \
     /* ::detail::ScalarTypeToCType::type being used directly due to */ \
     /* ambiguous reference which can't to be resolved. For some reason it */ \
-    /* can't pick between standalone::c10::detail and \
-     * standalone::c10::cuda::detail. */ \
+    /* can't pick between executorch::backends::aoti::slim::c10::detail and \
+     * executorch::backends::aoti::slim::c10::cuda::detail. */ \
     /* For repro example, please see: */ \
     /* https://gist.github.com/izdeby/952ae7cf256ddb740a73776d39a7e7ba */ \
     /* TODO: remove once the bug is fixed. */ \
@@ -191,7 +203,7 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_ScalarTypeToCPPType)
 
 #undef SPECIALIZE_ScalarTypeToCPPType
 
-template <standalone::c10::ScalarType N>
+template <executorch::backends::aoti::slim::c10::ScalarType N>
 using ScalarTypeToCPPTypeT = typename ScalarTypeToCPPType<N>::type;
 
 } // namespace impl
 
@@ -199,12 +211,13 @@ template <typename T>
 struct CppTypeToScalarType;
 
-#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
-  template <> \
-  struct CppTypeToScalarType<cpp_type> \
-      : std::integral_constant< \
-            standalone::c10::ScalarType, \
-            standalone::c10::ScalarType::scalar_type> {};
+#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
+  template <> \
+  struct CppTypeToScalarType<cpp_type> \
+      : std::integral_constant< \
+            executorch::backends::aoti::slim::c10::ScalarType, \
+            executorch::backends::aoti::slim::c10::ScalarType::scalar_type> { \
+  };
 
 AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
 
@@ -233,106 +246,119 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
 // instead, new types should be added to use sites on a case-by-case basis.
 // We generally are not accepting new dtypes due to binary size concerns.
-#define AT_FORALL_SCALAR_TYPES_AND(SCALARTYPE, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE>::t), \
-    SCALARTYPE)
+#define AT_FORALL_SCALAR_TYPES_AND(SCALARTYPE, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE>:: \
+        t), \
+    SCALARTYPE)
 
-#define AT_FORALL_SCALAR_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2)
+#define AT_FORALL_SCALAR_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2)
 
-#define AT_FORALL_SCALAR_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE3>::t), \
-    SCALARTYPE3)
+#define AT_FORALL_SCALAR_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE3>:: \
+        t), \
+    SCALARTYPE3)
 
-#define AT_FORALL_SCALAR_TYPES_AND7( \
-    SCALARTYPE1, \
-    SCALARTYPE2, \
-    SCALARTYPE3, \
-    SCALARTYPE4, \
-    SCALARTYPE5, \
-    SCALARTYPE6, \
-    SCALARTYPE7, \
-    _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE3>::t), \
-    SCALARTYPE3) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE4>::t), \
-    SCALARTYPE4) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE5>::t), \
-    SCALARTYPE5) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE6>::t), \
-    SCALARTYPE6) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-        standalone::c10::ScalarType::SCALARTYPE7>::t), \
-    SCALARTYPE7)
+#define AT_FORALL_SCALAR_TYPES_AND7( \
+    SCALARTYPE1, \
+    SCALARTYPE2, \
+    SCALARTYPE3, \
+    SCALARTYPE4, \
+    SCALARTYPE5, \
+    SCALARTYPE6, \
+    SCALARTYPE7, \
+    _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE3>:: \
+        t), \
+    SCALARTYPE3) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE4>:: \
+        t), \
+    SCALARTYPE4) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE5>:: \
+        t), \
+    SCALARTYPE5) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE6>:: \
+        t), \
+    SCALARTYPE6) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE7>:: \
+        t), \
+    SCALARTYPE7)
 
-#define AT_FORALL_QINT_TYPES(_) \
-  _(standalone::c10::qint8, QInt8) \
-  _(standalone::c10::quint8, QUInt8) \
-  _(standalone::c10::qint32, QInt32) \
-  _(standalone::c10::quint4x2, QUInt4x2) \
-  _(standalone::c10::quint2x4, QUInt2x4)
+#define AT_FORALL_QINT_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::qint8, QInt8) \
+  _(executorch::backends::aoti::slim::c10::quint8, QUInt8) \
+  _(executorch::backends::aoti::slim::c10::qint32, QInt32) \
+  _(executorch::backends::aoti::slim::c10::quint4x2, QUInt4x2) \
+  _(executorch::backends::aoti::slim::c10::quint2x4, QUInt2x4)
 
-#define AT_FORALL_FLOAT8_TYPES(_) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu)
+#define AT_FORALL_FLOAT8_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, Float8_e8m0fnu)
 
-#define AT_FORALL_COMPLEX_TYPES(_) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble)
+#define AT_FORALL_COMPLEX_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble)
 
 #define DEFINE_CONSTANT(_, name) \
   constexpr ScalarType k##name = ScalarType::name;
@@ -450,10 +476,11 @@ inline ScalarType toUnderlying(ScalarType t) {
 }
 
 inline bool isSignedType(ScalarType t) {
-#define CASE_ISSIGNED(name) \
-  case ScalarType::name: \
-    return std::numeric_limits<::standalone::c10::impl::ScalarTypeToCPPTypeT< \
-        ScalarType::name>>::is_signed;
+#define CASE_ISSIGNED(name) \
+  case ScalarType::name: \
+    return std::numeric_limits< \
+        ::executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPTypeT< \
+            ScalarType::name>>::is_signed;
 
   // TODO(#146647): If we expect to have numeric_limits for everything,
   // let's just have a big macro for the whole thing.
@@ -605,20 +632,21 @@ constexpr auto b1 = ScalarType::Bool;
 constexpr auto bf = ScalarType::BFloat16;
 constexpr auto ud = ScalarType::Undefined;
 
-constexpr auto index2dtype = array_of<standalone::c10::ScalarType>(
-    u1,
-    i1,
-    i2,
-    i4,
-    i8,
-    f2,
-    f4,
-    f8,
-    c2,
-    c4,
-    c8,
-    b1,
-    bf);
+constexpr auto index2dtype =
+    array_of<executorch::backends::aoti::slim::c10::ScalarType>(
+        u1,
+        i1,
+        i2,
+        i4,
+        i8,
+        f2,
+        f4,
+        f8,
+        c2,
+        c4,
+        c8,
+        b1,
+        bf);
 
 constexpr std::array<int64_t, static_cast<size_t>(ScalarType::NumOptions)>
 calculate_dtype2index() {
@@ -728,8 +756,8 @@ inline ScalarType promoteTypes(ScalarType a, ScalarType b) {
 
 inline std::ostream& operator<<(
     std::ostream& stream,
-    standalone::c10::ScalarType scalar_type) {
+    executorch::backends::aoti::slim::c10::ScalarType scalar_type) {
   return stream << toString(scalar_type);
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/SizesAndStrides.h b/backends/aoti/slim/c10/core/SizesAndStrides.h
index aef0ddab171..0b9edaccde7 100644
--- a/backends/aoti/slim/c10/core/SizesAndStrides.h
+++ b/backends/aoti/slim/c10/core/SizesAndStrides.h
@@ -10,7 +10,7 @@
 
 #define STANDALONE_SIZES_AND_STRIDES_MAX_INLINE_SIZE 5
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 // Packed container for TensorImpl sizes and strides.
 // This design improves on the previous approach of using a pair of
@@ -399,4 +399,4 @@ class SizesAndStrides {
   };
 };
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/WrapDimMinimal.h b/backends/aoti/slim/c10/core/WrapDimMinimal.h
index 651421e6d89..68c80a4abc3 100644
--- a/backends/aoti/slim/c10/core/WrapDimMinimal.h
+++ b/backends/aoti/slim/c10/core/WrapDimMinimal.h
@@ -7,7 +7,7 @@
 
 // Different from the original implementation in c10, we don't need
 // to support SymInt here.
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
 template <typename T>
 T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar);
@@ -25,7 +25,7 @@ T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
     return dim;
   }
   // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors)
-  return standalone::c10::detail::maybe_wrap_dim_slow(
+  return executorch::backends::aoti::slim::c10::detail::maybe_wrap_dim_slow(
       std::move(dim), std::move(dim_post_expr), wrap_scalar);
 }
 
@@ -48,7 +48,7 @@ T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
       "Dimension specified as ",
       dim,
      " but tensor has no dimensions");
-  return standalone::c10::maybe_wrap_dim(
+  return executorch::backends::aoti::slim::c10::maybe_wrap_dim(
       std::move(dim),
       /*dim_post_expr=*/1,
       /*wrap_scalar=*/false);
@@ -70,4 +70,4 @@ T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
       false, "should never reach here as dim should be out-of-bounds");
 }
 } // namespace detail
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/Array.h b/backends/aoti/slim/c10/util/Array.h
index 39eabc830d1..d093d26c51a 100644
--- a/backends/aoti/slim/c10/util/Array.h
+++ b/backends/aoti/slim/c10/util/Array.h
@@ -3,7 +3,7 @@
 #include
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 // This helper function creates a constexpr std::array
 // From a compile time list of values, without requiring you to explicitly
@@ -15,4 +15,4 @@
 inline constexpr auto array_of(T&&... t) -> std::array<V, sizeof...(T)> {
   return {{std::forward<T>(t)...}};
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/ArrayRef.h b/backends/aoti/slim/c10/util/ArrayRef.h
index 4a09f7a9335..9c7c6cd781d 100644
--- a/backends/aoti/slim/c10/util/ArrayRef.h
+++ b/backends/aoti/slim/c10/util/ArrayRef.h
@@ -29,7 +29,7 @@
 #include
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /// ArrayRef - Represent a constant reference to an array (0 or more elements
 /// consecutively in memory), i.e. a start pointer and a length. It allows
 /// various APIs to take consecutive elements easily and conveniently.
@@ -324,41 +324,49 @@ ArrayRef<T> makeArrayRef(const T (&Arr)[N]) {
 }
 
 // WARNING: Template instantiation will NOT be willing to do an implicit
-// conversions to get you to an standalone::c10::ArrayRef, which is why we
-// need so many overloads.
+// conversions to get you to an executorch::backends::aoti::slim::c10::ArrayRef,
+// which is why we need so many overloads.
 
 template <typename T>
 bool operator==(
-    standalone::c10::ArrayRef<T> a1,
-    standalone::c10::ArrayRef<T> a2) {
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
   return a1.equals(a2);
 }
 
 template <typename T>
 bool operator!=(
-    standalone::c10::ArrayRef<T> a1,
-    standalone::c10::ArrayRef<T> a2) {
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
   return !a1.equals(a2);
 }
 
 template <typename T>
-bool operator==(const std::vector<T>& a1, standalone::c10::ArrayRef<T> a2) {
-  return standalone::c10::ArrayRef<T>(a1).equals(a2);
+bool operator==(
+    const std::vector<T>& a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
+  return executorch::backends::aoti::slim::c10::ArrayRef<T>(a1).equals(a2);
 }
 
 template <typename T>
-bool operator!=(const std::vector<T>& a1, standalone::c10::ArrayRef<T> a2) {
-  return !standalone::c10::ArrayRef<T>(a1).equals(a2);
+bool operator!=(
+    const std::vector<T>& a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
+  return !executorch::backends::aoti::slim::c10::ArrayRef<T>(a1).equals(a2);
 }
 
 template <typename T>
-bool operator==(standalone::c10::ArrayRef<T> a1, const std::vector<T>& a2) {
-  return a1.equals(standalone::c10::ArrayRef<T>(a2));
+bool operator==(
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    const std::vector<T>& a2) {
+  return a1.equals(executorch::backends::aoti::slim::c10::ArrayRef<T>(a2));
 }
 
 template <typename T>
-bool operator!=(standalone::c10::ArrayRef<T> a1, const std::vector<T>& a2) {
-  return !a1.equals(standalone::c10::ArrayRef<T>(a2));
+bool operator!=(
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    const std::vector<T>& a2) {
+  return !a1.equals(executorch::backends::aoti::slim::c10::ArrayRef<T>(a2));
 }
 
 using IntArrayRef = ArrayRef<int64_t>;
 
@@ -368,4 +376,4 @@ using IntList
                  "semantics obvious. Use IntArrayRef instead!")]] = ArrayRef<int64_t>;
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/BFloat16-inl.h b/backends/aoti/slim/c10/util/BFloat16-inl.h
index 4608d9a6c54..5c41d4aaad0 100644
--- a/backends/aoti/slim/c10/util/BFloat16-inl.h
+++ b/backends/aoti/slim/c10/util/BFloat16-inl.h
@@ -16,7 +16,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #include // for SYCL 2020
 #endif
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 /// Constructors
 inline STANDALONE_HOST_DEVICE BFloat16::BFloat16(float value)
@@ -26,7 +26,8 @@ inline STANDALONE_HOST_DEVICE BFloat16::BFloat16(float value)
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
-      x(standalone::c10::bit_cast<uint16_t>(sycl::ext::oneapi::bfloat16(value)))
+      x(executorch::backends::aoti::slim::c10::bit_cast<uint16_t>(
+          sycl::ext::oneapi::bfloat16(value)))
 #else
       // RNE by default
       x(detail::round_to_nearest_even(value))
@@ -289,12 +290,12 @@ inline STANDALONE_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) {
   return float(lhs) < float(rhs);
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 
 namespace std {
 
 template <>
-class numeric_limits<standalone::c10::BFloat16> {
+class numeric_limits<executorch::backends::aoti::slim::c10::BFloat16> {
  public:
   static constexpr bool is_signed = true;
   static constexpr bool is_specialized = true;
@@ -322,41 +323,44 @@ class numeric_limits<standalone::c10::BFloat16> {
   static constexpr auto tinyness_before =
       numeric_limits<float>::tinyness_before;
 
-  static constexpr standalone::c10::BFloat16 min() {
-    return standalone::c10::BFloat16(
-        0x0080, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 min() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x0080, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 lowest() {
-    return standalone::c10::BFloat16(
-        0xFF7F, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 lowest() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0xFF7F, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 max() {
-    return standalone::c10::BFloat16(
-        0x7F7F, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 max() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F7F, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 epsilon() {
-    return standalone::c10::BFloat16(
-        0x3C00, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 epsilon() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x3C00, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
  }
-  static constexpr standalone::c10::BFloat16 round_error() {
-    return standalone::c10::BFloat16(
-        0x3F00, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  round_error() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x3F00, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 infinity() {
-    return standalone::c10::BFloat16(
-        0x7F80, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 infinity() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F80, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 quiet_NaN() {
-    return standalone::c10::BFloat16(
-        0x7FC0, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7FC0, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 signaling_NaN() {
-    return standalone::c10::BFloat16(
-        0x7F80, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  signaling_NaN() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F80, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 denorm_min() {
-    return standalone::c10::BFloat16(
-        0x0001, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x0001, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
 };
diff --git a/backends/aoti/slim/c10/util/BFloat16-math.h b/backends/aoti/slim/c10/util/BFloat16-math.h
index f036f309e26..ad67d81fa23 100644
--- a/backends/aoti/slim/c10/util/BFloat16-math.h
+++ b/backends/aoti/slim/c10/util/BFloat16-math.h
@@ -8,243 +8,276 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
 #endif
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 template <typename T>
 struct is_reduced_floating_point
     : std::integral_constant<
           bool,
-          std::is_same_v<T, standalone::c10::Half> ||
-              std::is_same_v<T, standalone::c10::BFloat16>> {};
+          std::is_same_v<T, executorch::backends::aoti::slim::c10::Half> ||
+              std::is_same_v<
+                  T,
+                  executorch::backends::aoti::slim::c10::BFloat16>> {};
 
 template <typename T>
 constexpr bool is_reduced_floating_point_v =
     is_reduced_floating_point<T>::value;
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 
 namespace std {
 
 #if !defined(FBCODE_CAFFE2) && !defined(STANDALONE_NODEPRECATED)
-using standalone::c10::is_reduced_floating_point;
-using standalone::c10::is_reduced_floating_point_v;
+using executorch::backends::aoti::slim::c10::is_reduced_floating_point;
+using executorch::backends::aoti::slim::c10::is_reduced_floating_point_v;
 #endif // !defined(FBCODE_CAFFE2) && !defined(STANDALONE_NODEPRECATED)
 
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T acos(T a) {
   return std::acos(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T asin(T a) {
   return std::asin(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T atan(T a) {
   return std::atan(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T atanh(T a) {
   return std::atanh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T erf(T a) {
   return std::erf(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T erfc(T a) {
   return std::erfc(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T exp(T a) {
   return std::exp(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T expm1(T a) {
   return std::expm1(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline bool isfinite(T a) {
   return std::isfinite(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log(T a) {
   return std::log(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log10(T a) {
   return std::log10(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log1p(T a) {
   return std::log1p(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log2(T a) {
   return std::log2(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T ceil(T a) {
   return std::ceil(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T cos(T a) {
   return std::cos(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T floor(T a) {
   return std::floor(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T nearbyint(T a) {
   return std::nearbyint(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sin(T a) {
   return std::sin(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T tan(T a) {
   return std::tan(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sinh(T a) {
   return std::sinh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T cosh(T a) {
   return std::cosh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T tanh(T a) {
   return std::tanh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T trunc(T a) {
   return std::trunc(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T lgamma(T a) {
   return std::lgamma(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sqrt(T a) {
   return std::sqrt(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T rsqrt(T a) {
   return 1.0 / std::sqrt(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T abs(T a) {
   return std::abs(float(a));
 }
 #if defined(_MSC_VER) && defined(__CUDACC__)
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, double b) {
   return std::pow(float(a), float(b));
 }
 #else
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, double b) {
   return std::pow(float(a), b);
 }
 #endif
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, T b) {
   return std::pow(float(a), float(b));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T fmod(T a, T b) {
   return std::fmod(float(a), float(b));
 }
@@ -277,8 +310,9 @@ inline T fmod(T a, T b) {
  */
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 STANDALONE_HOST_DEVICE inline T nextafter(T from, T to) {
   // Reference:
   // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
diff --git a/backends/aoti/slim/c10/util/BFloat16.h b/backends/aoti/slim/c10/util/BFloat16.h
index ed6d07f53d0..d1b2a5baeb2 100644
--- a/backends/aoti/slim/c10/util/BFloat16.h
+++ b/backends/aoti/slim/c10/util/BFloat16.h
@@ -20,7 +20,7 @@
 #include // for SYCL 2020
 #endif
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 namespace detail {
 inline STANDALONE_HOST_DEVICE float f32_from_bits(uint16_t src) {
@@ -118,6 +118,6 @@ inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
   return out;
 }
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 
 #include // IWYU pragma: keep
diff --git a/backends/aoti/slim/c10/util/Exception.h b/backends/aoti/slim/c10/util/Exception.h
index 6ab2bd8aae6..f83bf3f074a 100644
--- a/backends/aoti/slim/c10/util/Exception.h
+++ b/backends/aoti/slim/c10/util/Exception.h
@@ -6,8 +6,8 @@
 #include
 
 // In the standalone version, STANDALONE_CHECK throws std::runtime_error
-// instead of standalone::c10::Error.
-namespace standalone::c10::detail {
+// instead of executorch::backends::aoti::slim::c10::Error.
+namespace executorch::backends::aoti::slim::c10::detail {
 template <typename... Args>
 std::string torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {
   // This is similar to the one in c10/util/Exception.h, but does
@@ -25,14 +25,14 @@ inline const char* torchCheckMsgImpl(const char* msg) {
 inline const char* torchCheckMsgImpl(const char* /*msg*/, const char* args) {
   return args;
 }
-} // namespace standalone::c10::detail
+} // namespace executorch::backends::aoti::slim::c10::detail
 
-#define STANDALONE_CHECK_MSG(cond, type, ...) \
-  (::standalone::c10::detail::torchCheckMsgImpl( \
-      "Expected " #cond \
-      " to be true, but got false. " \
-      "(Could this error message be improved? If so, " \
-      "please report an enhancement request to PyTorch.)", \
+#define STANDALONE_CHECK_MSG(cond, type, ...) \
+  (::executorch::backends::aoti::slim::c10::detail::torchCheckMsgImpl( \
+      "Expected " #cond \
+      " to be true, but got false. " \
+      "(Could this error message be improved? If so, " \
+      "please report an enhancement request to PyTorch.)", \
       ##__VA_ARGS__))
 
 #define STANDALONE_CHECK(cond, ...) \
   if (STANDALONE_UNLIKELY_OR_CONST(!(cond))) { \
@@ -63,8 +63,9 @@ inline const char* torchCheckMsgImpl(const char* /*msg*/, const char* args) {
           ##__VA_ARGS__)); \
   }
 
-#define WARNING_MESSAGE_STRING(...) \
-  ::standalone::c10::detail::torchCheckMsgImpl(__VA_ARGS__)
+#define WARNING_MESSAGE_STRING(...) \
+  ::executorch::backends::aoti::slim::c10::detail::torchCheckMsgImpl( \
+      __VA_ARGS__)
 
 #ifdef DISABLE_WARN
 #define _STANDALONE_WARN_WITH(...) ((void)0);
diff --git a/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h b/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
index 600e281b583..182163b9ca2 100644
--- a/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
+++ b/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
@@ -17,7 +17,7 @@
 /// sign/exponent/mantissa | seem : seem
 ///
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 struct alignas(1) Float4_e2m1fn_x2 {
   uint8_t val_;
@@ -25,4 +25,4 @@
   STANDALONE_HOST_DEVICE explicit Float4_e2m1fn_x2(uint8_t val) : val_(val) {}
 };
 
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h b/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
index cc31b82e699..a0cb1db2888 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
@@ -9,7 +9,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 /// Constructors
 
@@ -229,14 +229,15 @@ operator/(int64_t a, Float8_e4m3fn b) {
 }
 
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e4m3fn to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e4m3fn to
+/// float.
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Float8_e4m3fn> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e4m3fn> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = true;
@@ -262,33 +263,45 @@ class numeric_limits<standalone::c10::Float8_e4m3fn> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before = false;

-  static constexpr standalone::c10::Float8_e4m3fn min() {
-    return standalone::c10::Float8_e4m3fn(
-        0x08, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x08,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn lowest() {
-    return standalone::c10::Float8_e4m3fn(
-        0xFE, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0xFE,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn max() {
-    return standalone::c10::Float8_e4m3fn(
-        0x7E, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn max() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x7E,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn epsilon() {
-    return standalone::c10::Float8_e4m3fn(
-        0x20, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x20,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn round_error() {
-    return standalone::c10::Float8_e4m3fn(
-        0x30, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x30,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn quiet_NaN() {
-    return standalone::c10::Float8_e4m3fn(
-        0x7F, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x7F,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn denorm_min() {
-    return standalone::c10::Float8_e4m3fn(
-        0x01, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x01,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
 };

diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fn.h b/backends/aoti/slim/c10/util/Float8_e4m3fn.h
index 320a677cbbb..22118007289 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fn.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fn.h
@@ -32,7 +32,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -233,6 +233,6 @@ inline std::ostream& operator<<(std::ostream& out, const
     Float8_e4m3fn& value) {
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h b/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
index 55a6ce73972..51f7c017504 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
@@ -10,7 +10,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Constructors

@@ -239,14 +239,15 @@ operator/(int64_t a, Float8_e4m3fnuz b) {
 }

 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e4m3fnuz to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e4m3fnuz to
+/// float.

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Float8_e4m3fnuz> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e4m3fnuz> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = true;
@@ -272,38 +273,54 @@ class numeric_limits<standalone::c10::Float8_e4m3fnuz> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before = false;

-  static constexpr standalone::c10::Float8_e4m3fnuz min() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x08, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x08,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz lowest() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0xFF, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0xFF,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz max() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x7F, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  max() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x7F,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz epsilon() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x28, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x28,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz round_error() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x38, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x38,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz infinity() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  infinity() {
     // NaN (no infinities)
-    return standalone::c10::Float8_e4m3fnuz(
-        0x80, standalone::c10::Float8_e4m3fnuz::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x80,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz quiet_NaN() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x80, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x80,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz denorm_min() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x01, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x01,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
 };

diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h b/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h
index ff3c050f018..b9c8ae582f4 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h
@@ -31,7 +31,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -133,6 +133,6 @@ inline std::ostream& operator<<(
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/Float8_e5m2-inl.h b/backends/aoti/slim/c10/util/Float8_e5m2-inl.h
index c8e90a8aa0d..bdc80613015 100644
--- a/backends/aoti/slim/c10/util/Float8_e5m2-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e5m2-inl.h
@@ -13,7 +13,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #define MAN_WIDTH_FP8 2
 #define EXP_BIAS_FP8 15

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Constructors

@@ -229,14 +229,14 @@ inline STANDALONE_HOST_DEVICE Float8_e5m2 operator/(int64_t a, Float8_e5m2 b) {
 }

 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e5m2 to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e5m2 to float.
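To see what the NOTE above means in practice: because Float8_e5m2 (like the other Float8 types in this stack) defines an implicit conversion to float, the built-in float comparison operators apply directly, so the header never needs to declare operator< or operator==. A minimal sketch, with an illustrative (assumed) include path:

    #include <iostream>
    // Assumed path for illustration; use whatever include path this header
    // is actually exposed under in the ExecuTorch tree.
    #include "backends/aoti/slim/c10/util/Float8_e5m2.h"

    using executorch::backends::aoti::slim::c10::Float8_e5m2;

    int main() {
      Float8_e5m2 a(1.5f);
      Float8_e5m2 b(2.0f);
      // Both operands implicitly convert to float, so these are float compares.
      std::cout << (a < b) << " " << (a == b) << "\n";
      return 0;
    }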
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Float8_e5m2> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e5m2> {
  public:
   static constexpr bool is_signed = true;
   static constexpr bool is_integer = false;
@@ -263,37 +263,42 @@ class numeric_limits<standalone::c10::Float8_e5m2> {
   static constexpr auto tinyness_before =
       numeric_limits<float>::tinyness_before;

-  static constexpr standalone::c10::Float8_e5m2 min() {
-    return standalone::c10::Float8_e5m2(
-        0x4, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 min() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x4, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 max() {
-    return standalone::c10::Float8_e5m2(
-        0x7B, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 max() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x7B, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 lowest() {
-    return standalone::c10::Float8_e5m2(
-        0xFB, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0xFB, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 epsilon() {
-    return standalone::c10::Float8_e5m2(
-        0x34, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x34, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 round_error() {
-    return standalone::c10::Float8_e5m2(
-        0x38, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x38, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 infinity() {
-    return standalone::c10::Float8_e5m2(
-        0x7C, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2
+  infinity() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x7C, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 quiet_NaN() {
-    return standalone::c10::Float8_e5m2(
-        0x7F, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x7F, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2 denorm_min() {
-    return standalone::c10::Float8_e5m2(
-        0x01, standalone::c10::Float8_e5m2::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2(
+        0x01, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits());
   }
 };

diff --git a/backends/aoti/slim/c10/util/Float8_e5m2.h b/backends/aoti/slim/c10/util/Float8_e5m2.h
index 88d1aab0525..6e9fa9b5aed 100644
--- a/backends/aoti/slim/c10/util/Float8_e5m2.h
+++ b/backends/aoti/slim/c10/util/Float8_e5m2.h
@@ -16,7 +16,7 @@

 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -142,6 +142,6 @@ inline std::ostream& operator<<(std::ostream& out, const Float8_e5m2& value) {
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h b/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h
index d2ccac329af..ca46726424b 100644
--- a/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h
@@ -10,7 +10,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Constructors

@@ -243,14 +243,15 @@ operator/(int64_t a, Float8_e5m2fnuz b) {
 }

 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e5m2fnuz to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e5m2fnuz to
+/// float.

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Float8_e5m2fnuz> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e5m2fnuz> {
  public:
   static constexpr bool is_signed = true;
   static constexpr bool is_integer = false;
@@ -277,39 +278,55 @@ class numeric_limits<standalone::c10::Float8_e5m2fnuz> {
   static constexpr auto tinyness_before =
       numeric_limits<float>::tinyness_before;

-  static constexpr standalone::c10::Float8_e5m2fnuz min() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x04, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  min() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x04,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz max() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x7F, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  max() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x7F,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz lowest() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0xFF, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0xFF,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz epsilon() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x34, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x34,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz round_error() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x38, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x38,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz infinity() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x80, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  infinity() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x80,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
   // TODO(future): we are mapping neg_zero to both inf and NaN, this is
   // surprising and we should figure out what to do about it.
-  static constexpr standalone::c10::Float8_e5m2fnuz quiet_NaN() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x80, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x80,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e5m2fnuz denorm_min() {
-    return standalone::c10::Float8_e5m2fnuz(
-        0x01, standalone::c10::Float8_e5m2fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz(
+        0x01,
+        executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits());
   }
 };

diff --git a/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h b/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h
index c16e5613202..66c2427c8ac 100644
--- a/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h
+++ b/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h
@@ -31,7 +31,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -133,6 +133,6 @@ inline std::ostream& operator<<(
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h b/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h
index f510ca551b8..4e35e04bc22 100644
--- a/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h
@@ -11,7 +11,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Constructors

@@ -25,18 +25,20 @@ inline STANDALONE_HOST_DEVICE Float8_e8m0fnu::operator float() const {
   // if exponent is zero, need to special case to return 2^-127 instead of zero
   if (x == 0) {
-    return standalone::c10::detail::fp32_from_bits(0x00400000);
+    return executorch::backends::aoti::slim::c10::detail::fp32_from_bits(
+        0x00400000);
   }

   // if exponent is NaN, need to special case to return properly encoded NaN
   if (isnan()) {
-    return standalone::c10::detail::fp32_from_bits(0x7f800001);
+    return executorch::backends::aoti::slim::c10::detail::fp32_from_bits(
+        0x7f800001);
   }

   // leave sign at 0, set the exponent bits, leave stored mantissa at 0
   uint32_t res = x << 23;
-  return standalone::c10::detail::fp32_from_bits(res);
+  return executorch::backends::aoti::slim::c10::detail::fp32_from_bits(res);
 }

 /// Special values helper

@@ -46,14 +48,15 @@ inline STANDALONE_HOST_DEVICE bool Float8_e8m0fnu::isnan() const {
 }

 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e8m0fnu to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e8m0fnu to
+/// float.
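The zero and NaN special cases above are easiest to see in isolation. A standalone restatement of the decode path (a sketch that mirrors the member operator; std::memcpy stands in for the header's fp32_from_bits):

    #include <cstdint>
    #include <cstring>

    // Float8_e8m0fnu stores only a biased 8-bit exponent: value = 2^(x - 127).
    inline float e8m0_to_float_sketch(uint8_t x) {
      uint32_t bits;
      if (x == 0) {
        bits = 0x00400000; // biased exponent 0 encodes 2^-127, a float denormal
      } else if (x == 0xFF) {
        bits = 0x7f800001; // the all-ones exponent encodes NaN
      } else {
        bits = static_cast<uint32_t>(x) << 23; // exponent bits, zero mantissa
      }
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }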
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Float8_e8m0fnu> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e8m0fnu> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = false;
@@ -79,37 +82,47 @@ class numeric_limits<standalone::c10::Float8_e8m0fnu> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before = false;

-  static constexpr standalone::c10::Float8_e8m0fnu min() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu min() {
     // 2^-127
-    return standalone::c10::Float8_e8m0fnu(
-        0b00000000, standalone::c10::Float8_e8m0fnu::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b00000000,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
-  static constexpr standalone::c10::Float8_e8m0fnu lowest() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu
+  lowest() {
     // 2^-127
-    return standalone::c10::Float8_e8m0fnu(
-        0b00000000, standalone::c10::Float8_e8m0fnu::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b00000000,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
-  static constexpr standalone::c10::Float8_e8m0fnu max() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu max() {
     // 254 biased, which is 127 unbiased, so 2^127
-    return standalone::c10::Float8_e8m0fnu(
-        0b11111110, standalone::c10::Float8_e8m0fnu::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b11111110,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
-  static constexpr standalone::c10::Float8_e8m0fnu epsilon() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu
+  epsilon() {
     // according to https://en.cppreference.com/w/cpp/types/numeric_limits, this
     // is "the difference between 1.0 and the next representable value of the
     // given floating-point type". The next representable value is 2.0, so the
     // difference is 1.0 which is 2^0. 0 unbiased is 127 biased.
-    return standalone::c10::Float8_e8m0fnu(
-        0b01111111, standalone::c10::Float8_e8m0fnu::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b01111111,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
-  static constexpr standalone::c10::Float8_e8m0fnu round_error() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu
+  round_error() {
     // 0.5 in float, which is 2^-1, and -1 + 127 = 126
-    return standalone::c10::Float8_e8m0fnu(
-        0b01111110, standalone::c10::Float8_e8m0fnu::from_bits());
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b01111110,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
-  static constexpr standalone::c10::Float8_e8m0fnu quiet_NaN() {
-    return standalone::c10::Float8_e8m0fnu(
-        0b11111111, standalone::c10::Float8_e8m0fnu::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e8m0fnu(
+        0b11111111,
+        executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits());
   }
 };
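A quick worked check of the encodings above: for this exponent-only format, value(x) = 2^(x - 127). A hedged sketch using std::ldexp rather than the header's from_bits machinery:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    inline float e8m0_value(uint8_t biased) {
      return std::ldexp(1.0f, static_cast<int>(biased) - 127);
    }

    int main() {
      assert(e8m0_value(0b01111111) == 1.0f); // epsilon: 2.0 - 1.0 = 1.0 = 2^0
      assert(e8m0_value(0b01111110) == 0.5f); // round_error: 2^-1
      assert(e8m0_value(0b11111110) == std::ldexp(1.0f, 127)); // max: 2^127
      return 0;
    }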
diff --git a/backends/aoti/slim/c10/util/Float8_e8m0fnu.h b/backends/aoti/slim/c10/util/Float8_e8m0fnu.h
index 2e2e46d627a..0f67705c510 100644
--- a/backends/aoti/slim/c10/util/Float8_e8m0fnu.h
+++ b/backends/aoti/slim/c10/util/Float8_e8m0fnu.h
@@ -27,7 +27,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -38,7 +38,8 @@ namespace detail {
 inline STANDALONE_HOST_DEVICE uint8_t fp8e8m0fnu_from_fp32_value(float f) {
   // TODO(#146647): maybe rewrite without control flow

-  uint32_t f_bits = standalone::c10::detail::fp32_to_bits(f);
+  uint32_t f_bits =
+      executorch::backends::aoti::slim::c10::detail::fp32_to_bits(f);

   // extract the exponent
   uint32_t exponent = (f_bits >> 23) & 0b11111111;

@@ -114,6 +115,6 @@ inline std::ostream& operator<<(
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h b/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h
index 00bfa8cd8fc..49bcaad6842 100644
--- a/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h
+++ b/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h
@@ -8,7 +8,7 @@
 #include
 #endif

-namespace standalone::c10::detail {
+namespace executorch::backends::aoti::slim::c10::detail {

 /*
  * Convert a 8-bit floating-point number in either f8 E4M3FNUZ or bf8 E5M2FNUZ

@@ -61,4 +61,4 @@ inline STANDALONE_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
   return fp32_from_bits(retval);
 }

-} // namespace standalone::c10::detail
+} // namespace executorch::backends::aoti::slim::c10::detail

diff --git a/backends/aoti/slim/c10/util/Half-inl.h b/backends/aoti/slim/c10/util/Half-inl.h
index 05fa6349f81..f7b25c0ebe0 100644
--- a/backends/aoti/slim/c10/util/Half-inl.h
+++ b/backends/aoti/slim/c10/util/Half-inl.h
@@ -31,7 +31,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 #if defined(__aarch64__) && !defined(__CUDACC__)
 /// Constructors

@@ -46,7 +46,8 @@ inline STANDALONE_HOST_DEVICE Half::Half(float value)
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
       x(__half_as_short(__float2half(value)))
 #elif defined(__SYCL_DEVICE_ONLY__)
-      x(standalone::c10::bit_cast<uint16_t>(sycl::half(value)))
+      x(executorch::backends::aoti::slim::c10::bit_cast<uint16_t>(
+          sycl::half(value)))
 #elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
     !defined(__APPLE__)
       x(at::vec::float2half_scalar(value))

@@ -62,7 +63,7 @@ inline STANDALONE_HOST_DEVICE Half::operator float() const {
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   return __half2float(*reinterpret_cast<const __half*>(&x));
 #elif defined(__SYCL_DEVICE_ONLY__)
-  return float(standalone::c10::bit_cast<sycl::half>(x));
+  return float(executorch::backends::aoti::slim::c10::bit_cast<sycl::half>(x));
 #elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
     !defined(__APPLE__)
   return at::vec::half2float_scalar(x);

@@ -127,7 +128,7 @@ inline STANDALONE_HOST_DEVICE Half operator-(const Half& a) {
     defined(__HIP_DEVICE_COMPILE__)
   return __hneg(a);
 #elif defined(__SYCL_DEVICE_ONLY__)
-  return -standalone::c10::bit_cast<sycl::half>(a);
+  return -executorch::backends::aoti::slim::c10::bit_cast<sycl::half>(a);
 #else
   return -static_cast<float>(a);
 #endif

@@ -283,14 +284,14 @@ inline STANDALONE_HOST_DEVICE Half operator/(int64_t a, Half b) {
 }

 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Half to float.
+/// conversion from executorch::backends::aoti::slim::c10::Half to float.

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <>
-class numeric_limits<standalone::c10::Half> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Half> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = true;
@@ -317,32 +318,41 @@ class numeric_limits<standalone::c10::Half> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before =
       numeric_limits<float>::tinyness_before;
-  static constexpr standalone::c10::Half min() {
-    return standalone::c10::Half(0x0400, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half min() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x0400, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half lowest() {
-    return standalone::c10::Half(0xFBFF, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half lowest() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0xFBFF, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half max() {
-    return standalone::c10::Half(0x7BFF, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half max() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x7BFF, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half epsilon() {
-    return standalone::c10::Half(0x1400, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half epsilon() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x1400, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half round_error() {
-    return standalone::c10::Half(0x3800, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half round_error() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x3800, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half infinity() {
-    return standalone::c10::Half(0x7C00, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half infinity() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x7C00, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half quiet_NaN() {
-    return standalone::c10::Half(0x7E00, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x7E00, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half signaling_NaN() {
-    return standalone::c10::Half(0x7D00, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half signaling_NaN() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x7D00, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
-  static constexpr standalone::c10::Half denorm_min() {
-    return standalone::c10::Half(0x0001, standalone::c10::Half::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Half denorm_min() {
+    return executorch::backends::aoti::slim::c10::Half(
+        0x0001, executorch::backends::aoti::slim::c10::Half::from_bits());
   }
 };

diff --git a/backends/aoti/slim/c10/util/Half.h b/backends/aoti/slim/c10/util/Half.h
index 86f8d8683e0..26597d23e53 100644
--- a/backends/aoti/slim/c10/util/Half.h
+++ b/backends/aoti/slim/c10/util/Half.h
@@ -61,7 +61,7 @@
 #endif // __x86_64__ || _M_X64 || __i386 || _M_IX86
 #endif // __GNUC__ || __clang__

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -359,11 +359,11 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) {
 #if defined(__aarch64__) && !defined(__CUDACC__)
 inline float16_t fp16_from_bits(uint16_t h) {
-  return standalone::c10::bit_cast<float16_t>(h);
+  return executorch::backends::aoti::slim::c10::bit_cast<float16_t>(h);
 }

 inline uint16_t fp16_to_bits(float16_t f) {
-  return standalone::c10::bit_cast<uint16_t>(f);
+  return executorch::backends::aoti::slim::c10::bit_cast<uint16_t>(f);
 }

 // According to https://godbolt.org/z/frExdbsWG it would translate to single

@@ -419,6 +419,6 @@ inline std::ostream& operator<<(std::ostream& out, const Half& value) {
   return out;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 #include // IWYU pragma: keep

diff --git a/backends/aoti/slim/c10/util/StringUtil.h b/backends/aoti/slim/c10/util/StringUtil.h
index ff7c591e734..8a696322716 100644
--- a/backends/aoti/slim/c10/util/StringUtil.h
+++ b/backends/aoti/slim/c10/util/StringUtil.h
@@ -3,7 +3,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 template <class Container>
 inline std::string Join(const std::string& delimiter, const Container& v) {
   std::stringstream s;
@@ -13,4 +13,4 @@ inline std::string Join(const std::string& delimiter, const Container& v) {
   }
   return std::move(s).str();
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

diff --git a/backends/aoti/slim/c10/util/TypeCast.h b/backends/aoti/slim/c10/util/TypeCast.h
index cfaaaebec95..e3d65a7ef16 100644
--- a/backends/aoti/slim/c10/util/TypeCast.h
+++ b/backends/aoti/slim/c10/util/TypeCast.h
@@ -20,7 +20,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 template <typename dest_t, typename src_t>
 struct needs_real {

@@ -103,66 +103,76 @@ struct static_cast_with_inter_type {

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::BFloat16> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::BFloat16> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::BFloat16 src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::BFloat16 src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Float8_e5m2> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Float8_e5m2> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Float8_e5m2 src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Float8_e5m2 src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Float8_e5m2fnuz> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Float8_e5m2fnuz> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Float8_e5m2fnuz src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Float8_e4m3fn> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Float8_e4m3fn> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Float8_e4m3fn src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Float8_e4m3fn src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
  }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Float8_e4m3fnuz> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Float8_e4m3fnuz> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Float8_e4m3fnuz src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

@@ -170,40 +180,47 @@ struct static_cast_with_inter_type<
 // based off our apply macros?
 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Float8_e8m0fnu> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Float8_e8m0fnu> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Float8_e8m0fnu src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Float8_e8m0fnu src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::Half> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::Half> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::Half src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        standalone::c10::complex<float>{src});
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::Half src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        executorch::backends::aoti::slim::c10::complex<float>{src});
   }
 };

 template <>
 struct static_cast_with_inter_type<
-    standalone::c10::complex<standalone::c10::Half>,
-    standalone::c10::complex<double>> {
+    executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>,
+    executorch::backends::aoti::slim::c10::complex<double>> {
   STANDALONE_HOST_DEVICE
-  __ubsan_ignore_undefined__ static inline standalone::c10::complex<
-      standalone::c10::Half>
-  apply(standalone::c10::complex<double> src) {
-    return static_cast<standalone::c10::complex<standalone::c10::Half>>(
-        static_cast<standalone::c10::complex<float>>(src));
+  __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim::
+      c10::complex<executorch::backends::aoti::slim::c10::Half>
+  apply(executorch::backends::aoti::slim::c10::complex<double> src) {
+    return static_cast<executorch::backends::aoti::slim::c10::complex<
+        executorch::backends::aoti::slim::c10::Half>>(
+        static_cast<executorch::backends::aoti::slim::c10::complex<float>>(
+            src));
   }
 };

@@ -229,7 +246,7 @@ To checked_convert(From f, const char* name) {
   return convert<To, From>(f);
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 STANDALONE_CLANG_DIAGNOSTIC_POP()

diff --git a/backends/aoti/slim/c10/util/TypeSafeSignMath.h b/backends/aoti/slim/c10/util/TypeSafeSignMath.h
index 276b1cee7d0..7e23f64a39e 100644
--- a/backends/aoti/slim/c10/util/TypeSafeSignMath.h
+++ b/backends/aoti/slim/c10/util/TypeSafeSignMath.h
@@ -13,7 +13,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion")
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Returns false since we cannot have x < 0 if x is unsigned.
 template <typename T>

@@ -33,7 +33,8 @@ inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) {
 /// NOTE: Will fail on an unsigned custom type
 /// For the most part it's possible to fix this if
 /// the custom type has a constexpr constructor.
-/// However, notably, standalone::c10::Half does not :-(
+/// However, notably, executorch::backends::aoti::slim::c10::Half does not
+/// :-(
 template <typename T>
 inline constexpr bool is_negative(const T& x) {
   return is_negative(x, std::is_unsigned<T>());
 }
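The unsigned/signed dispatch these notes describe, restated in isolation (a sketch of the pattern, not the patched file): the unsigned overload short-circuits to false, and the signed overload needs T(0) to be constexpr-constructible, which is exactly what the slim c10 Half lacks.

    #include <type_traits>

    template <typename T>
    constexpr bool is_negative_sketch(const T&, std::true_type /*is_unsigned*/) {
      return false; // an unsigned value can never be negative
    }

    template <typename T>
    constexpr bool is_negative_sketch(const T& x, std::false_type /*is_unsigned*/) {
      return x < T(0); // requires a constexpr-constructible T(0)
    }

    template <typename T>
    constexpr bool is_negative_sketch(const T& x) {
      return is_negative_sketch(x, std::is_unsigned<T>());
    }

    static_assert(!is_negative_sketch(1u));
    static_assert(is_negative_sketch(-1));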
@@ -55,7 +56,8 @@ inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) {
 /// NOTE: Will fail on an unsigned custom type
 /// For the most part it's possible to fix this if
 /// the custom type has a constexpr constructor.
-/// However, notably, standalone::c10::Half does not :-(
+/// However, notably, executorch::backends::aoti::slim::c10::Half does not
+/// :-(
 template <typename T>
 inline constexpr int signum(const T& x) {
   return signum(x, std::is_unsigned<T>());
 }

@@ -129,13 +131,14 @@ inline constexpr bool less_than_lowest(
 /// NOTE: Will fail on an unsigned custom type
 /// For the most part it's possible to fix this if
 /// the custom type has a constexpr constructor.
-/// However, notably, standalone::c10::Half does not :
+/// However, notably, executorch::backends::aoti::slim::c10::Half does not
+/// :
 template <typename Limit, typename T>
 inline constexpr bool less_than_lowest(const T& x) {
   return less_than_lowest<Limit>(
       x, std::is_unsigned<Limit>(), std::is_unsigned<T>());
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 STANDALONE_CLANG_DIAGNOSTIC_POP()

diff --git a/backends/aoti/slim/c10/util/accumulate.h b/backends/aoti/slim/c10/util/accumulate.h
index 4972dd9826a..578c6246b29 100644
--- a/backends/aoti/slim/c10/util/accumulate.h
+++ b/backends/aoti/slim/c10/util/accumulate.h
@@ -11,7 +11,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /// Sum of a list of integers; accumulates into the int64_t datatype
 template <

@@ -122,4 +122,4 @@ inline int64_t numelements_between_dim(int k, int l, const C& dims) {
   return multiply_integers(cbegin, cend);
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

diff --git a/backends/aoti/slim/c10/util/bit_cast.h b/backends/aoti/slim/c10/util/bit_cast.h
index 765ec641486..5a1e1208acf 100644
--- a/backends/aoti/slim/c10/util/bit_cast.h
+++ b/backends/aoti/slim/c10/util/bit_cast.h
@@ -11,7 +11,7 @@
 #endif // __has_include(<bit>) && (__cplusplus >= 202002L ||
        // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L))

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 #if STANDALONE_HAVE_STD_BIT_CAST
 using std::bit_cast;

@@ -41,4 +41,4 @@ bit_cast(const From& src) noexcept {
 #endif // STANDALONE_HAVE_STD_BIT_CAST
 #undef STANDALONE_HAVE_STD_BIT_CAST

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

diff --git a/backends/aoti/slim/c10/util/bits.h b/backends/aoti/slim/c10/util/bits.h
index 2d365463a01..d04f88dafc8 100644
--- a/backends/aoti/slim/c10/util/bits.h
+++ b/backends/aoti/slim/c10/util/bits.h
@@ -3,7 +3,7 @@

 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 /**
  * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte

@@ -58,4 +58,4 @@ struct alignas(2) bits16 {
   STANDALONE_HOST_DEVICE explicit bits16(uint16_t val) : val_(val) {}
 };

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

diff --git a/backends/aoti/slim/c10/util/complex.h b/backends/aoti/slim/c10/util/complex.h
index 988e446b3e4..b48ef792ed7 100644
--- a/backends/aoti/slim/c10/util/complex.h
+++ b/backends/aoti/slim/c10/util/complex.h
@@ -17,19 +17,19 @@
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wfloat-conversion")
 #endif

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

-// standalone::c10::complex is an implementation of complex numbers that aims
-// to work on all devices supported by PyTorch
+// executorch::backends::aoti::slim::c10::complex is an implementation of
+// complex numbers that aims to work on all devices supported by PyTorch
 //
 // Most of the APIs duplicates std::complex
 // Reference: https://en.cppreference.com/w/cpp/numeric/complex
 //
 // [NOTE: Complex Operator Unification]
 // Operators currently use a mix of std::complex, thrust::complex, and
-// standalone::c10::complex internally. The end state is that all operators
-// will use standalone::c10::complex internally. Until then, there may be
-// some hacks to support all variants.
+// executorch::backends::aoti::slim::c10::complex internally. The end state is
+// that all operators will use executorch::backends::aoti::slim::c10::complex
+// internally. Until then, there may be some hacks to support all variants.
 //
 //
 // [Note on Constructors]

@@ -89,9 +89,9 @@ namespace standalone::c10 {
 //
 // std::complex has custom literals `i`, `if` and `il` defined in namespace
 // `std::literals::complex_literals`. We define our own custom literals in the
-// namespace `standalone::c10::complex_literals`. Our custom literals does not
-// follow the same behavior as in std::complex, instead, we define _if, _id to
-// construct float/double complex literals.
+// namespace `executorch::backends::aoti::slim::c10::complex_literals`. Our
+// custom literals does not follow the same behavior as in std::complex,
+// instead, we define _if, _id to construct float/double complex literals.
 //
 //
 // [real() and imag()]

@@ -138,9 +138,11 @@ namespace standalone::c10 {
 //
 //
 //
-// TODO(@zasdfgbnm): standalone::c10::complex<standalone::c10::Half> is not
-// currently supported, because:
-// - lots of members and functions of standalone::c10::Half are not constexpr
+// TODO(@zasdfgbnm):
+// executorch::backends::aoti::slim::c10::complex<executorch::backends::aoti::slim::c10::Half>
+// is not currently supported, because:
+// - lots of members and functions of
+//   executorch::backends::aoti::slim::c10::Half are not constexpr
 // - thrust::complex only support float and double

 template <typename T>
 struct alignas(sizeof(T) * 2) complex {

@@ -166,7 +168,8 @@ struct alignas(sizeof(T) * 2) complex {
 #endif

   // Use SFINAE to specialize casting constructor for
-  // standalone::c10::complex<float> and standalone::c10::complex<double>
+  // executorch::backends::aoti::slim::c10::complex<float> and
+  // executorch::backends::aoti::slim::c10::complex<double>
   template <typename U = T>
   STANDALONE_HOST_DEVICE explicit constexpr complex(
       const std::enable_if_t<std::is_same_v<U, float>, complex<double>>& other)
       : real_(other.real()), imag_(other.imag()) {}

@@ -430,69 +433,69 @@ constexpr complex<T> operator/(const T& lhs, const complex<T>& rhs) {
   return result /= rhs;
 }

-// Define operators between integral scalars and standalone::c10::complex.
-// std::complex does not support this when T is a floating-point number. This is
-// useful because it saves a lot of "static_cast" when operate a complex and an
-// integer. This makes the code both less verbose and potentially more
-// efficient.
+// Define operators between integral scalars and
+// executorch::backends::aoti::slim::c10::complex. std::complex does not support
+// this when T is a floating-point number. This is useful because it saves a lot
+// of "static_cast" when operate a complex and an integer. This makes the code
+// both less verbose and potentially more efficient.
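What those integer-scalar overloads buy, in contrast with std::complex (a sketch; the actual overloads are declared just below):

    #include <complex>

    std::complex<float> add_three(std::complex<float> z) {
      // With std::complex, mixing in an integer literal requires a cast:
      // plain `z + 3` fails to compile because T deduces to both float and int.
      return z + static_cast<float>(3);
    }

    // The COMPLEX_INTEGER_OP_TEMPLATE_CONDITION overloads defined below let
    // executorch::backends::aoti::slim::c10::complex<float> accept `z + 3`
    // directly, with no cast at the call site.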
 #define COMPLEX_INTEGER_OP_TEMPLATE_CONDITION                 \
   typename std::enable_if_t<                                  \
       std::is_floating_point_v<fT> && std::is_integral_v<iT>, \
       int> = 0

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator+(
-    const standalone::c10::complex<fT>& a,
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator+(
+    const executorch::backends::aoti::slim::c10::complex<fT>& a,
     const iT& b) {
   return a + static_cast<fT>(b);
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator+(
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator+(
     const iT& a,
-    const standalone::c10::complex<fT>& b) {
+    const executorch::backends::aoti::slim::c10::complex<fT>& b) {
   return static_cast<fT>(a) + b;
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator-(
-    const standalone::c10::complex<fT>& a,
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator-(
+    const executorch::backends::aoti::slim::c10::complex<fT>& a,
     const iT& b) {
   return a - static_cast<fT>(b);
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator-(
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator-(
     const iT& a,
-    const standalone::c10::complex<fT>& b) {
+    const executorch::backends::aoti::slim::c10::complex<fT>& b) {
   return static_cast<fT>(a) - b;
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator*(
-    const standalone::c10::complex<fT>& a,
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator*(
+    const executorch::backends::aoti::slim::c10::complex<fT>& a,
     const iT& b) {
   return a * static_cast<fT>(b);
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator*(
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator*(
     const iT& a,
-    const standalone::c10::complex<fT>& b) {
+    const executorch::backends::aoti::slim::c10::complex<fT>& b) {
   return static_cast<fT>(a) * b;
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator/(
-    const standalone::c10::complex<fT>& a,
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator/(
+    const executorch::backends::aoti::slim::c10::complex<fT>& a,
     const iT& b) {
   return a / static_cast<fT>(b);
 }

 template <typename fT, typename iT, COMPLEX_INTEGER_OP_TEMPLATE_CONDITION>
-constexpr standalone::c10::complex<fT> operator/(
+constexpr executorch::backends::aoti::slim::c10::complex<fT> operator/(
     const iT& a,
-    const standalone::c10::complex<fT>& b) {
+    const executorch::backends::aoti::slim::c10::complex<fT>& b) {
   return static_cast<fT>(a) / b;
 }

@@ -545,7 +548,7 @@ std::basic_istream<CharT, Traits>& operator>>(
   return is;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 // std functions
 //

@@ -554,17 +557,18 @@
 namespace std {

 template <typename T>
-constexpr T real(const standalone::c10::complex<T>& z) {
+constexpr T real(const executorch::backends::aoti::slim::c10::complex<T>& z) {
   return z.real();
 }

 template <typename T>
-constexpr T imag(const standalone::c10::complex<T>& z) {
+constexpr T imag(const executorch::backends::aoti::slim::c10::complex<T>& z) {
   return z.imag();
 }

 template <typename T>
-STANDALONE_HOST_DEVICE T abs(const standalone::c10::complex<T>& z) {
+STANDALONE_HOST_DEVICE T
+abs(const executorch::backends::aoti::slim::c10::complex<T>& z) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
   return thrust::abs(static_cast<thrust::complex<T>>(z));
 #else

@@ -579,14 +583,15 @@ STANDALONE_HOST_DEVICE T abs(const standalone::c10::complex<T>& z) {
 #endif

 template <typename T>
-STANDALONE_HOST_DEVICE T arg(const standalone::c10::complex<T>& z) {
+STANDALONE_HOST_DEVICE T
+arg(const executorch::backends::aoti::slim::c10::complex<T>& z) {
   return ROCm_Bug(std)::atan2(std::imag(z), std::real(z));
 }

 #undef ROCm_Bug

 template <typename T>
-constexpr T norm(const standalone::c10::complex<T>& z) {
+constexpr T norm(const executorch::backends::aoti::slim::c10::complex<T>& z) {
   return z.real() * z.real() + z.imag() * z.imag();
 }

@@ -596,11 +601,12 @@ constexpr T norm(const standalone::c10::complex<T>& z) {
 // constexpr std::complex<double> conj( DoubleOrInteger z );
 // constexpr std::complex<long double> conj( long double z );
 // These are not implemented
-// TODO(@zasdfgbnm): implement them as standalone::c10::conj
+// TODO(@zasdfgbnm): implement them as
+// executorch::backends::aoti::slim::c10::conj
 template <typename T>
-constexpr standalone::c10::complex<T> conj(
-    const standalone::c10::complex<T>& z) {
-  return standalone::c10::complex<T>(z.real(), -z.imag());
+constexpr executorch::backends::aoti::slim::c10::complex<T> conj(
+    const executorch::backends::aoti::slim::c10::complex<T>& z) {
+  return executorch::backends::aoti::slim::c10::complex<T>(z.real(), -z.imag());
 }

 // Thrust does not have complex --> complex version of thrust::proj,
 // TODO(@zasdfgbnm): implement it by ourselves

 // There is no standalone version of std::polar, because std::polar always
-// returns std::complex. Use standalone::c10::polar instead;
+// returns std::complex. Use executorch::backends::aoti::slim::c10::polar
+// instead;

 } // namespace std

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 template <typename T>
 STANDALONE_HOST_DEVICE complex<T> polar(const T& r, const T& theta = T()) {

@@ -639,12 +646,12 @@ struct alignas(4) complex<Half> {
       const Half& imag)
       : real_(real), imag_(imag) {}
   STANDALONE_HOST_DEVICE inline complex(
-      const standalone::c10::complex<float>& value)
+      const executorch::backends::aoti::slim::c10::complex<float>& value)
       : real_(value.real()), imag_(value.imag()) {}

   // Conversion operator
-  inline STANDALONE_HOST_DEVICE operator standalone::c10::complex<float>()
-      const {
+  inline STANDALONE_HOST_DEVICE
+  operator executorch::backends::aoti::slim::c10::complex<float>() const {
     return {real_, imag_};
   }

@@ -678,7 +685,7 @@ struct alignas(4) complex<Half> {
   }
 };

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 STANDALONE_CLANG_DIAGNOSTIC_POP()

diff --git a/backends/aoti/slim/c10/util/complex_math.h b/backends/aoti/slim/c10/util/complex_math.h
index 56fc84fe90b..3ada9db6f00 100644
--- a/backends/aoti/slim/c10/util/complex_math.h
+++ b/backends/aoti/slim/c10/util/complex_math.h
@@ -5,52 +5,52 @@

 #include

-namespace standalone::c10::complex_math {
+namespace executorch::backends::aoti::slim::c10::complex_math {

 // Exponential functions

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> exp(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+exp(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::exp(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::exp(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+log(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::log(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::log(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log10(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+log10(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::log10(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::log10(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log2(
-    const standalone::c10::complex<T>& x) {
-  const standalone::c10::complex<T> log2 =
-      standalone::c10::complex<T>(::log(2.0), 0.0);
-  return standalone::c10::complex_math::log(x) / log2;
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+log2(const executorch::backends::aoti::slim::c10::complex<T>& x) {
+  const executorch::backends::aoti::slim::c10::complex<T> log2 =
+      executorch::backends::aoti::slim::c10::complex<T>(::log(2.0), 0.0);
+  return executorch::backends::aoti::slim::c10::complex_math::log(x) / log2;
 }

 // Power functions

@@ -59,34 +59,36 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log2(
     (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))
 namespace _detail {
 template <typename T>
-standalone::c10::complex<T> compute_csqrt(
-    const standalone::c10::complex<T>& z) {
+executorch::backends::aoti::slim::c10::complex<T> compute_csqrt(
+    const executorch::backends::aoti::slim::c10::complex<T>& z) {
   constexpr auto half = T(.5);

   // Trust standard library to correctly handle infs and NaNs
   if (std::isinf(z.real()) || std::isinf(z.imag()) || std::isnan(z.real()) ||
       std::isnan(z.imag())) {
-    return static_cast<standalone::c10::complex<T>>(
+    return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
         std::sqrt(static_cast<std::complex<T>>(z)));
   }

   // Special case for square root of pure imaginary values
   if (z.real() == T(0)) {
     if (z.imag() == T(0)) {
-      return standalone::c10::complex<T>(T(0), z.imag());
+      return executorch::backends::aoti::slim::c10::complex<T>(T(0), z.imag());
     }
     auto v = std::sqrt(half * std::abs(z.imag()));
-    return standalone::c10::complex<T>(v, std::copysign(v, z.imag()));
+    return executorch::backends::aoti::slim::c10::complex<T>(
+        v, std::copysign(v, z.imag()));
   }

   // At this point, z is non-zero and finite
   if (z.real() >= 0.0) {
     auto t = std::sqrt((z.real() + std::abs(z)) * half);
-    return standalone::c10::complex<T>(t, half * (z.imag() / t));
+    return executorch::backends::aoti::slim::c10::complex<T>(
+        t, half * (z.imag() / t));
   }

   auto t = std::sqrt((-z.real() + std::abs(z)) * half);
-  return standalone::c10::complex<T>(
+  return executorch::backends::aoti::slim::c10::complex<T>(
       half * std::abs(z.imag() / t), std::copysign(t, z.imag()));
 }

@@ -95,58 +97,59 @@ standalone::c10::complex<T> compute_csqrt(
 // cacos(z).re = 2*atan2(sqrt(1-z).re(), sqrt(1+z).re())
 // cacos(z).im = asinh((sqrt(conj(1+z))*sqrt(1-z)).im())
 template <typename T>
-standalone::c10::complex<T> compute_cacos(
-    const standalone::c10::complex<T>& z) {
+executorch::backends::aoti::slim::c10::complex<T> compute_cacos(
+    const executorch::backends::aoti::slim::c10::complex<T>& z) {
   auto constexpr one = T(1);
   // Trust standard library to correctly handle infs and NaNs
   if (std::isinf(z.real()) || std::isinf(z.imag()) || std::isnan(z.real()) ||
       std::isnan(z.imag())) {
-    return static_cast<standalone::c10::complex<T>>(
+    return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
         std::acos(static_cast<std::complex<T>>(z)));
   }
-  auto a =
-      compute_csqrt(standalone::c10::complex<T>(one - z.real(), -z.imag()));
-  auto b = compute_csqrt(standalone::c10::complex<T>(one + z.real(), z.imag()));
-  auto c =
-      compute_csqrt(standalone::c10::complex<T>(one + z.real(), -z.imag()));
+  auto a = compute_csqrt(executorch::backends::aoti::slim::c10::complex<T>(
+      one - z.real(), -z.imag()));
+  auto b = compute_csqrt(executorch::backends::aoti::slim::c10::complex<T>(
+      one + z.real(), z.imag()));
+  auto c = compute_csqrt(executorch::backends::aoti::slim::c10::complex<T>(
+      one + z.real(), -z.imag()));
   auto r = T(2) * std::atan2(a.real(), b.real());
   // Explicitly unroll (a*c).imag()
   auto i = std::asinh(a.real() * c.imag() + a.imag() * c.real());
-  return standalone::c10::complex<T>(r, i);
+  return executorch::backends::aoti::slim::c10::complex<T>(r, i);
 }

-inline standalone::c10::complex<float> sqrt(
-    const standalone::c10::complex<float>& in) {
+inline executorch::backends::aoti::slim::c10::complex<float> sqrt(
+    const executorch::backends::aoti::slim::c10::complex<float>& in) {
   return compute_csqrt(in);
 }

-inline standalone::c10::complex<double> sqrt(
-    const standalone::c10::complex<double>& in) {
+inline executorch::backends::aoti::slim::c10::complex<double> sqrt(
+    const executorch::backends::aoti::slim::c10::complex<double>& in) {
   return compute_csqrt(in);
 }

-inline standalone::c10::complex<float> acos(
-    const standalone::c10::complex<float>& in) {
+inline executorch::backends::aoti::slim::c10::complex<float> acos(
+    const executorch::backends::aoti::slim::c10::complex<float>& in) {
   return compute_cacos(in);
 }

-inline standalone::c10::complex<double> acos(
-    const standalone::c10::complex<double>& in) {
+inline executorch::backends::aoti::slim::c10::complex<double> acos(
+    const executorch::backends::aoti::slim::c10::complex<double>& in) {
   return compute_cacos(in);
 }

 } // namespace _detail
 #endif

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sqrt(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sqrt(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sqrt(static_cast<thrust::complex<T>>(x)));
 #elif !( \
     defined(_LIBCPP_VERSION) || \
     (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)))
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sqrt(static_cast<std::complex<T>>(x)));
 #else
   return _detail::sqrt(x);

@@ -154,79 +157,84 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sqrt(
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const standalone::c10::complex<T>& x,
-    const standalone::c10::complex<T>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x,
+    const executorch::backends::aoti::slim::c10::complex<T>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(thrust::pow(
-      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<T>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
+      thrust::pow(
+          static_cast<thrust::complex<T>>(x),
+          static_cast<thrust::complex<T>>(y)));
 #else
-  return static_cast<standalone::c10::complex<T>>(std::pow(
-      static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
+      std::pow(
+          static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const standalone::c10::complex<T>& x,
-    const T& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x, const T& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::pow(static_cast<thrust::complex<T>>(x), y));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::pow(static_cast<std::complex<T>>(x), y));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const T& x,
-    const standalone::c10::complex<T>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const T& x, const executorch::backends::aoti::slim::c10::complex<T>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::pow(x, static_cast<thrust::complex<T>>(y)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
      std::pow(x, static_cast<std::complex<T>>(y)));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const standalone::c10::complex<T>& x,
-    const standalone::c10::complex<U>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x,
+    const executorch::backends::aoti::slim::c10::complex<U>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(thrust::pow(
-      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<U>>(y)));
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
+      thrust::pow(
+          static_cast<thrust::complex<T>>(x),
+          static_cast<thrust::complex<U>>(y)));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(std::pow(
-      static_cast<std::complex<T>>(x), static_cast<std::complex<U>>(y)));
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
+      std::pow(
+          static_cast<std::complex<T>>(x), static_cast<std::complex<U>>(y)));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const standalone::c10::complex<T>& x,
-    const U& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x, const U& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
      thrust::pow(static_cast<thrust::complex<T>>(x), y));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
      std::pow(static_cast<std::complex<T>>(x), y));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const T& x,
-    const standalone::c10::complex<U>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const T& x, const executorch::backends::aoti::slim::c10::complex<U>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
      thrust::pow(x, static_cast<thrust::complex<U>>(y)));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<
+      executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
      std::pow(x, static_cast<std::complex<U>>(y)));
 #endif
 }

@@ -234,61 +242,61 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
 // Trigonometric functions

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sin(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sin(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sin(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sin(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> cos(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+cos(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::cos(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::cos(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> tan(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+tan(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::tan(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::tan(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> asin(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+asin(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::asin(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::asin(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> acos(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+acos(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::acos(static_cast<thrust::complex<T>>(x)));
 #elif !defined(_LIBCPP_VERSION)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::acos(static_cast<std::complex<T>>(x)));
 #else
   return _detail::acos(x);

@@ -296,13 +304,13 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> acos(
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> atan(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+atan(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::atan(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::atan(static_cast<std::complex<T>>(x)));
 #endif
 }

@@ -310,80 +318,80 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> atan(
 // Hyperbolic functions

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sinh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sinh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sinh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sinh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> cosh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+cosh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::cosh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::cosh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> tanh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+tanh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::tanh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::tanh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> asinh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+asinh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::asinh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::asinh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline
executorch::backends::aoti::slim::c10::complex +acosh(const executorch::backends::aoti::slim::c10::complex& x) { #if defined(__CUDACC__) || defined(__HIPCC__) - return static_cast>( + return static_cast>( thrust::acosh(static_cast>(x))); #else - return static_cast>( + return static_cast>( std::acosh(static_cast>(x))); #endif } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex atanh( - const standalone::c10::complex& x) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +atanh(const executorch::backends::aoti::slim::c10::complex& x) { #if defined(__CUDACC__) || defined(__HIPCC__) - return static_cast>( + return static_cast>( thrust::atanh(static_cast>(x))); #else - return static_cast>( + return static_cast>( std::atanh(static_cast>(x))); #endif } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex log1p( - const standalone::c10::complex& z) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +log1p(const executorch::backends::aoti::slim::c10::complex& z) { #if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \ defined(__HIPCC__) // For Mac, the new implementation yielded a high relative error. Falling back @@ -420,7 +428,7 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex log1p( #else // CPU path // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 - standalone::c10::complex u = z + T(1); + executorch::backends::aoti::slim::c10::complex u = z + T(1); if (u == T(1)) { return z; } else { @@ -434,8 +442,8 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex log1p( } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex expm1( - const standalone::c10::complex& z) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +expm1(const executorch::backends::aoti::slim::c10::complex& z) { // expm1(z) = exp(z) - 1 // Define z = x + i * y // f = e ^ (x + i * y) - 1 @@ -451,50 +459,50 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex expm1( return {er, ei}; } -} // namespace standalone::c10::complex_math - -using standalone::c10::complex_math::acos; -using standalone::c10::complex_math::acosh; -using standalone::c10::complex_math::asin; -using standalone::c10::complex_math::asinh; -using standalone::c10::complex_math::atan; -using standalone::c10::complex_math::atanh; -using standalone::c10::complex_math::cos; -using standalone::c10::complex_math::cosh; -using standalone::c10::complex_math::exp; -using standalone::c10::complex_math::expm1; -using standalone::c10::complex_math::log; -using standalone::c10::complex_math::log10; -using standalone::c10::complex_math::log1p; -using standalone::c10::complex_math::log2; -using standalone::c10::complex_math::pow; -using standalone::c10::complex_math::sin; -using standalone::c10::complex_math::sinh; -using standalone::c10::complex_math::sqrt; -using standalone::c10::complex_math::tan; -using standalone::c10::complex_math::tanh; +} // namespace executorch::backends::aoti::slim::c10::complex_math + +using executorch::backends::aoti::slim::c10::complex_math::acos; +using executorch::backends::aoti::slim::c10::complex_math::acosh; +using executorch::backends::aoti::slim::c10::complex_math::asin; +using executorch::backends::aoti::slim::c10::complex_math::asinh; +using executorch::backends::aoti::slim::c10::complex_math::atan; +using executorch::backends::aoti::slim::c10::complex_math::atanh; +using executorch::backends::aoti::slim::c10::complex_math::cos; +using 
executorch::backends::aoti::slim::c10::complex_math::cosh;
+using executorch::backends::aoti::slim::c10::complex_math::exp;
+using executorch::backends::aoti::slim::c10::complex_math::expm1;
+using executorch::backends::aoti::slim::c10::complex_math::log;
+using executorch::backends::aoti::slim::c10::complex_math::log10;
+using executorch::backends::aoti::slim::c10::complex_math::log1p;
+using executorch::backends::aoti::slim::c10::complex_math::log2;
+using executorch::backends::aoti::slim::c10::complex_math::pow;
+using executorch::backends::aoti::slim::c10::complex_math::sin;
+using executorch::backends::aoti::slim::c10::complex_math::sinh;
+using executorch::backends::aoti::slim::c10::complex_math::sqrt;
+using executorch::backends::aoti::slim::c10::complex_math::tan;
+using executorch::backends::aoti::slim::c10::complex_math::tanh;
 
 namespace std {
 
-using standalone::c10::complex_math::acos;
-using standalone::c10::complex_math::acosh;
-using standalone::c10::complex_math::asin;
-using standalone::c10::complex_math::asinh;
-using standalone::c10::complex_math::atan;
-using standalone::c10::complex_math::atanh;
-using standalone::c10::complex_math::cos;
-using standalone::c10::complex_math::cosh;
-using standalone::c10::complex_math::exp;
-using standalone::c10::complex_math::expm1;
-using standalone::c10::complex_math::log;
-using standalone::c10::complex_math::log10;
-using standalone::c10::complex_math::log1p;
-using standalone::c10::complex_math::log2;
-using standalone::c10::complex_math::pow;
-using standalone::c10::complex_math::sin;
-using standalone::c10::complex_math::sinh;
-using standalone::c10::complex_math::sqrt;
-using standalone::c10::complex_math::tan;
-using standalone::c10::complex_math::tanh;
+using executorch::backends::aoti::slim::c10::complex_math::acos;
+using executorch::backends::aoti::slim::c10::complex_math::acosh;
+using executorch::backends::aoti::slim::c10::complex_math::asin;
+using executorch::backends::aoti::slim::c10::complex_math::asinh;
+using executorch::backends::aoti::slim::c10::complex_math::atan;
+using executorch::backends::aoti::slim::c10::complex_math::atanh;
+using executorch::backends::aoti::slim::c10::complex_math::cos;
+using executorch::backends::aoti::slim::c10::complex_math::cosh;
+using executorch::backends::aoti::slim::c10::complex_math::exp;
+using executorch::backends::aoti::slim::c10::complex_math::expm1;
+using executorch::backends::aoti::slim::c10::complex_math::log;
+using executorch::backends::aoti::slim::c10::complex_math::log10;
+using executorch::backends::aoti::slim::c10::complex_math::log1p;
+using executorch::backends::aoti::slim::c10::complex_math::log2;
+using executorch::backends::aoti::slim::c10::complex_math::pow;
+using executorch::backends::aoti::slim::c10::complex_math::sin;
+using executorch::backends::aoti::slim::c10::complex_math::sinh;
+using executorch::backends::aoti::slim::c10::complex_math::sqrt;
+using executorch::backends::aoti::slim::c10::complex_math::tan;
+using executorch::backends::aoti::slim::c10::complex_math::tanh;
 
 } // namespace std
 
diff --git a/backends/aoti/slim/c10/util/complex_utils.h b/backends/aoti/slim/c10/util/complex_utils.h
index 5b29406a186..af6d8203c65 100644
--- a/backends/aoti/slim/c10/util/complex_utils.h
+++ b/backends/aoti/slim/c10/util/complex_utils.h
@@ -5,7 +5,7 @@
 
 #include
 
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 
 template <typename T>
 struct is_complex : public std::false_type {};
 
@@ -14,7 +14,8 @@
 template <typename T>
 struct is_complex<std::complex<T>> : public
std::true_type {}; template -struct is_complex> : public std::true_type {}; +struct is_complex> + : public std::true_type {}; // Extract double from std::complex; is identity otherwise // TODO: Write in more idiomatic C++17 @@ -27,19 +28,20 @@ struct scalar_value_type> { using type = T; }; template -struct scalar_value_type> { +struct scalar_value_type> { using type = T; }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 namespace std { template -class numeric_limits> : public numeric_limits {}; +class numeric_limits> + : public numeric_limits {}; template -bool isnan(const standalone::c10::complex& v) { +bool isnan(const executorch::backends::aoti::slim::c10::complex& v) { return std::isnan(v.real()) || std::isnan(v.imag()); } diff --git a/backends/aoti/slim/c10/util/copysign.h b/backends/aoti/slim/c10/util/copysign.h index 1012934049c..ff0b0fcc847 100644 --- a/backends/aoti/slim/c10/util/copysign.h +++ b/backends/aoti/slim/c10/util/copysign.h @@ -3,7 +3,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { // Note: Explicit implementation of copysign for Half and BFloat16 // is needed to workaround g++-7/8 crash on aarch64, but also makes @@ -23,4 +23,4 @@ inline BFloat16 copysign(BFloat16 a, BFloat16 b) { return BFloat16((a.x & 0x7fff) | (b.x & 0x8000), BFloat16::from_bits()); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/floating_point_utils.h b/backends/aoti/slim/c10/util/floating_point_utils.h index 259cb93b0a5..dbe208b05b9 100644 --- a/backends/aoti/slim/c10/util/floating_point_utils.h +++ b/backends/aoti/slim/c10/util/floating_point_utils.h @@ -4,7 +4,7 @@ #include #include -namespace standalone::c10::detail { +namespace executorch::backends::aoti::slim::c10::detail { STANDALONE_HOST_DEVICE inline float fp32_from_bits(uint32_t w) { #if defined(__OPENCL_VERSION__) @@ -14,7 +14,7 @@ STANDALONE_HOST_DEVICE inline float fp32_from_bits(uint32_t w) { #elif defined(__INTEL_COMPILER) return _castu32_f32(w); #else - return standalone::c10::bit_cast(w); + return executorch::backends::aoti::slim::c10::bit_cast(w); #endif } @@ -26,8 +26,8 @@ STANDALONE_HOST_DEVICE inline uint32_t fp32_to_bits(float f) { #elif defined(__INTEL_COMPILER) return _castf32_u32(f); #else - return standalone::c10::bit_cast(f); + return executorch::backends::aoti::slim::c10::bit_cast(f); #endif } -} // namespace standalone::c10::detail +} // namespace executorch::backends::aoti::slim::c10::detail diff --git a/backends/aoti/slim/c10/util/generic_math.h b/backends/aoti/slim/c10/util/generic_math.h index 00bb4265d9d..6cc9ec72bec 100644 --- a/backends/aoti/slim/c10/util/generic_math.h +++ b/backends/aoti/slim/c10/util/generic_math.h @@ -6,20 +6,23 @@ #if defined(__CUDA_ARCH__) #include -#define STANDALONE_COMPAT_COPYSIGN standalone::c10::cuda::compat::copysign +#define STANDALONE_COMPAT_COPYSIGN \ + executorch::backends::aoti::slim::c10::cuda::compat::copysign // TODO: rocm is not supported yet // #elif defined(__HIPCC__) // #include -// #define STANDALONE_COMPAT_COPYSIGN standalone::c10::hip::compat::copysign +// #define STANDALONE_COMPAT_COPYSIGN +// executorch::backends::aoti::slim::c10::hip::compat::copysign #else #include -#define STANDALONE_COMPAT_COPYSIGN standalone::c10::copysign +#define STANDALONE_COMPAT_COPYSIGN \ + executorch::backends::aoti::slim::c10::copysign #endif // The functions in this file should be header-only as it is used under // 
ABI-compatibility mode. -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { // NOTE: [Floor Division in Python] // Python's __floordiv__ operator is more complicated than just floor(a / b). @@ -61,7 +64,7 @@ inline STANDALONE_HOST_DEVICE scalar_t div_floor_floating( template inline STANDALONE_HOST_DEVICE scalar_t div_floor_integer(scalar_t a, scalar_t b) { - if (standalone::c10::signs_differ(a, b)) { + if (executorch::backends::aoti::slim::c10::signs_differ(a, b)) { // Subtracts one from the results of truncation division if the // divisor and dividend have different sign(bit)s and the remainder of // the division is nonzero @@ -102,4 +105,4 @@ inline STANDALONE_HOST_DEVICE scalar_t div_mod(scalar_t a, scalar_t b) { return mod; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/irange.h b/backends/aoti/slim/c10/util/irange.h index 0d10f373a04..75c8b48d1ca 100644 --- a/backends/aoti/slim/c10/util/irange.h +++ b/backends/aoti/slim/c10/util/irange.h @@ -9,7 +9,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -48,9 +48,9 @@ struct integer_iterator { constexpr bool operator==(const integer_iterator& other) const { if constexpr (one_sided) { // Range-for loops' end test is `begin != end`, not `begin < - // end`. To handle `standalone::c10::irange(n)` where n < 0 (which - // should be empty), we just make `begin != end` fail whenever `end` is - // negative. + // end`. To handle `executorch::backends::aoti::slim::c10::irange(n)` + // where n < 0 (which should be empty), we just make `begin != end` fail + // whenever `end` is negative. return is_negative(other.value) || value == other.value; } else { return value == other.value; @@ -120,4 +120,4 @@ constexpr integer_range irange(Integer end) { return {Integer(), end}; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/llvmMathExtras.h b/backends/aoti/slim/c10/util/llvmMathExtras.h index 0b4f92c44c6..a42423d009d 100644 --- a/backends/aoti/slim/c10/util/llvmMathExtras.h +++ b/backends/aoti/slim/c10/util/llvmMathExtras.h @@ -56,7 +56,7 @@ unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask); } #endif -namespace standalone::c10::llvm { +namespace executorch::backends::aoti::slim::c10::llvm { /// The behavior an operation has on an input of 0. enum ZeroBehavior { /// The returned value is undefined. @@ -620,7 +620,7 @@ inline double BitsToDouble(uint64_t Bits) { /// This function takes a 32-bit integer and returns the bit equivalent float. inline float BitsToFloat(uint32_t Bits) { // TODO: Use std::bit_cast once C++20 becomes available. - return standalone::c10::bit_cast(Bits); + return executorch::backends::aoti::slim::c10::bit_cast(Bits); } /// This function takes a double and returns the bit equivalent 64-bit integer. @@ -896,4 +896,4 @@ SaturatingMultiplyAdd(T X, T Y, T A, bool* ResultOverflowed = nullptr) { /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. 
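// Illustrative sketch (not part of the patch): the div_floor_integer hunk
// above implements Python-style floor division, which differs from C++'s
// truncating division when the operands have opposite signs and the
// division is inexact. A minimal standalone version of the same rule, with
// hypothetical names:
#include <cassert>
#include <cstdint>

static int64_t div_floor_i64(int64_t a, int64_t b) {
  int64_t q = a / b;  // C++ integer division truncates toward zero
  int64_t r = a % b;  // remainder carries the sign of `a`
  // If the signs differ and the division is inexact, step once toward -inf.
  if ((r != 0) && ((r < 0) != (b < 0))) {
    --q;
  }
  return q;
}

int main() {
  assert(div_floor_i64(7, 2) == 3);
  assert(div_floor_i64(-7, 2) == -4);  // truncation alone would give -3
  assert(div_floor_i64(7, -2) == -4);
  return 0;
}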
extern const float huge_valf; -} // namespace standalone::c10::llvm +} // namespace executorch::backends::aoti::slim::c10::llvm diff --git a/backends/aoti/slim/c10/util/overflows.h b/backends/aoti/slim/c10/util/overflows.h index 5f636cd1a75..df2502d7910 100644 --- a/backends/aoti/slim/c10/util/overflows.h +++ b/backends/aoti/slim/c10/util/overflows.h @@ -8,7 +8,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { // In some versions of MSVC, there will be a compiler error when building. // C4146: unary minus operator applied to unsigned type, result still unsigned // C4804: unsafe use of type 'bool' in operation @@ -50,11 +50,12 @@ overflows(From f, bool strict_unsigned = false) { // `a + 255 * b`. if (!strict_unsigned) { return greater_than_max(f) || - (standalone::c10::is_negative(f) && + (executorch::backends::aoti::slim::c10::is_negative(f) && -static_cast(f) > static_cast(limit::max())); } } - return standalone::c10::less_than_lowest(f) || greater_than_max(f); + return executorch::backends::aoti::slim::c10::less_than_lowest(f) || + greater_than_max(f); } template @@ -97,4 +98,4 @@ std::enable_if_t::value, bool> overflows( typename scalar_value_type::type, typename From::value_type>(f.imag(), strict_unsigned); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/qint32.h b/backends/aoti/slim/c10/util/qint32.h index 7951bfd240a..2d3f72e9a10 100644 --- a/backends/aoti/slim/c10/util/qint32.h +++ b/backends/aoti/slim/c10/util/qint32.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * qint32 is for signed 32 bit quantized Tensors @@ -15,4 +15,4 @@ struct alignas(4) qint32 { STANDALONE_HOST_DEVICE explicit qint32(int32_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/qint8.h b/backends/aoti/slim/c10/util/qint8.h index 53c1fdf465a..f08ce5bfc3f 100644 --- a/backends/aoti/slim/c10/util/qint8.h +++ b/backends/aoti/slim/c10/util/qint8.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * This is the data type for quantized Tensors. 
Right now we only have @@ -17,4 +17,4 @@ struct alignas(1) qint8 { STANDALONE_HOST_DEVICE explicit qint8(int8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint2x4.h b/backends/aoti/slim/c10/util/quint2x4.h index 009802be7f2..e80848cd9eb 100644 --- a/backends/aoti/slim/c10/util/quint2x4.h +++ b/backends/aoti/slim/c10/util/quint2x4.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint2x4 is for un-signed 2 bit quantized Tensors that are packed to byte @@ -16,4 +16,4 @@ struct alignas(1) quint2x4 { STANDALONE_HOST_DEVICE explicit quint2x4(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint4x2.h b/backends/aoti/slim/c10/util/quint4x2.h index b6812ab8fde..1c2f8350596 100644 --- a/backends/aoti/slim/c10/util/quint4x2.h +++ b/backends/aoti/slim/c10/util/quint4x2.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint4x2 is for un-signed 4 bit quantized Tensors that are packed to byte @@ -16,4 +16,4 @@ struct alignas(1) quint4x2 { STANDALONE_HOST_DEVICE explicit quint4x2(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint8.h b/backends/aoti/slim/c10/util/quint8.h index 4019765ca4a..e8649bc4fa8 100644 --- a/backends/aoti/slim/c10/util/quint8.h +++ b/backends/aoti/slim/c10/util/quint8.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint8 is for unsigned 8 bit quantized Tensors @@ -15,4 +15,4 @@ struct alignas(1) quint8 { STANDALONE_HOST_DEVICE explicit quint8(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/safe_numerics.h b/backends/aoti/slim/c10/util/safe_numerics.h index 26a05c636aa..df0aa6e7c5c 100644 --- a/backends/aoti/slim/c10/util/safe_numerics.h +++ b/backends/aoti/slim/c10/util/safe_numerics.h @@ -12,7 +12,7 @@ #define STANDALONE_HAS_BUILTIN_OVERFLOW() (1) #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { STANDALONE_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) { @@ -40,8 +40,8 @@ mul_overflows(uint64_t a, uint64_t b, uint64_t* out) { *out = a * b; // This test isnt exact, but avoids doing integer division return ( - (standalone::c10::llvm::countLeadingZeros(a) + - standalone::c10::llvm::countLeadingZeros(b)) < 64); + (executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(a) + + executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(b)) < 64); #endif } @@ -65,7 +65,8 @@ bool safe_multiplies_u64(It first, It last, uint64_t* out) { uint64_t prod = 1; bool overflow = false; for (; first != last; ++first) { - overflow |= standalone::c10::mul_overflows(prod, *first, &prod); + overflow |= executorch::backends::aoti::slim::c10::mul_overflows( + prod, *first, &prod); } *out = prod; return overflow; @@ -78,7 +79,7 @@ bool safe_multiplies_u64(It first, It last, uint64_t* out) { prod *= x; // log2(0) isn't valid, so need to track it specially is_zero |= (x == 0); - prod_log2 += standalone::c10::llvm::Log2_64_Ceil(x); + prod_log2 += executorch::backends::aoti::slim::c10::llvm::Log2_64_Ceil(x); } *out 
= prod; // This test isnt exact, but avoids doing integer division @@ -91,4 +92,4 @@ bool safe_multiplies_u64(const Container& c, uint64_t* out) { return safe_multiplies_u64(c.begin(), c.end(), out); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/core/SlimTensor.h b/backends/aoti/slim/core/SlimTensor.h index 69ac4fec65f..9021e2db922 100644 --- a/backends/aoti/slim/core/SlimTensor.h +++ b/backends/aoti/slim/core/SlimTensor.h @@ -18,15 +18,15 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { class SlimTensor { public: SlimTensor( Storage&& storage, - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, - standalone::c10::ScalarType dtype, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::ScalarType dtype, int64_t storage_offset = 0) : storage_(std::move(storage)), storage_offset_(storage_offset), @@ -39,7 +39,7 @@ class SlimTensor { : storage_(Storage()), storage_offset_(0), numel_(0), - dtype_(standalone::c10::ScalarType::Float), + dtype_(executorch::backends::aoti::slim::c10::ScalarType::Float), is_contiguous_(true) { sizes_and_strides_.set_sizes({0}); sizes_and_strides_.set_strides({1}); @@ -67,42 +67,42 @@ class SlimTensor { } size_t itemsize() const { - return standalone::c10::elementSize(dtype_); + return executorch::backends::aoti::slim::c10::elementSize(dtype_); } - standalone::c10::IntArrayRef sizes() const { + executorch::backends::aoti::slim::c10::IntArrayRef sizes() const { return sizes_and_strides_.sizes_arrayref(); } int64_t size(int64_t dim) const { - int64_t wrapped_dim = - standalone::c10::maybe_wrap_dim(dim, static_cast(this->dim())); + int64_t wrapped_dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim( + dim, static_cast(this->dim())); return sizes_and_strides_.size_at(static_cast(wrapped_dim)); } - standalone::c10::IntArrayRef strides() const { + executorch::backends::aoti::slim::c10::IntArrayRef strides() const { return sizes_and_strides_.strides_arrayref(); } int64_t stride(int64_t dim) const { - int64_t wrapped_dim = - standalone::c10::maybe_wrap_dim(dim, static_cast(this->dim())); + int64_t wrapped_dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim( + dim, static_cast(this->dim())); return sizes_and_strides_.stride_at(static_cast(wrapped_dim)); } - standalone::c10::ScalarType dtype() const { + executorch::backends::aoti::slim::c10::ScalarType dtype() const { return dtype_; } - const standalone::c10::Device& device() const { + const executorch::backends::aoti::slim::c10::Device& device() const { return storage_->device(); } - standalone::c10::DeviceType device_type() const { + executorch::backends::aoti::slim::c10::DeviceType device_type() const { return storage_->device().type(); } - standalone::c10::DeviceIndex device_index() const { + executorch::backends::aoti::slim::c10::DeviceIndex device_index() const { return storage_->device().index(); } @@ -149,8 +149,8 @@ class SlimTensor { } void set_sizes_and_strides( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, std::optional storage_offset = std::nullopt) { const int64_t new_dim = static_cast(sizes.size()); STANDALONE_CHECK( @@ -175,7 +175,7 @@ class SlimTensor { if (dim == new_dim - 1) { 
new_strides[dim] = 1; } else { - overflowed |= standalone::c10::mul_overflows( + overflowed |= executorch::backends::aoti::slim::c10::mul_overflows( new_strides[dim + 1], std::max(new_sizes[dim + 1], 1), &new_strides[dim]); @@ -195,20 +195,24 @@ class SlimTensor { refresh_contiguous(); } - void set_sizes_contiguous(standalone::c10::IntArrayRef new_size) { + void set_sizes_contiguous( + executorch::backends::aoti::slim::c10::IntArrayRef new_size) { sizes_and_strides_.set_sizes(new_size); refresh_numel(); - empty_tensor_restride(standalone::c10::MemoryFormat::Contiguous); + empty_tensor_restride( + executorch::backends::aoti::slim::c10::MemoryFormat::Contiguous); } - void empty_tensor_restride(standalone::c10::MemoryFormat memory_format); + void empty_tensor_restride( + executorch::backends::aoti::slim::c10::MemoryFormat memory_format); SlimTensor resize_( - standalone::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, std::optional optional_memory_format); // Conversion operations - SlimTensor to(const standalone::c10::Device& device) const { + SlimTensor to( + const executorch::backends::aoti::slim::c10::Device& device) const { if (device == storage_->device()) { return *this; } @@ -230,7 +234,7 @@ class SlimTensor { return to(DEFAULT_CUDA_DEVICE); } - SlimTensor to(standalone::c10::ScalarType dtype) const { + SlimTensor to(executorch::backends::aoti::slim::c10::ScalarType dtype) const { STANDALONE_CHECK(false, "TBD: to(dtype)"); } @@ -252,7 +256,8 @@ class SlimTensor { // Case 2: At least one tensor is non-contiguous, perform element-wise copy // that respects both source and destination strides. - const size_t elem_size = standalone::c10::elementSize(dtype_); + const size_t elem_size = + executorch::backends::aoti::slim::c10::elementSize(dtype_); char* dst_data = static_cast(this->data_ptr()); const char* src_data = static_cast(other.data_ptr()); @@ -372,7 +377,8 @@ class SlimTensor { } } else { // Handle non-contiguous tensors by respecting strides - const size_t elem_size = standalone::c10::elementSize(this->dtype_); + const size_t elem_size = + executorch::backends::aoti::slim::c10::elementSize(this->dtype_); char* base_data = static_cast(this->data_ptr()); std::vector counter(this->dim(), 0); @@ -403,41 +409,43 @@ class SlimTensor { }; switch (this->dtype()) { - case standalone::c10::ScalarType::Double: + case executorch::backends::aoti::slim::c10::ScalarType::Double: fill_value(value.to()); break; - case standalone::c10::ScalarType::Float: + case executorch::backends::aoti::slim::c10::ScalarType::Float: fill_value(value.to()); break; - case standalone::c10::ScalarType::Half: - fill_value(value.to()); + case executorch::backends::aoti::slim::c10::ScalarType::Half: + fill_value(value.to()); break; - case standalone::c10::ScalarType::BFloat16: - fill_value(value.to()); + case executorch::backends::aoti::slim::c10::ScalarType::BFloat16: + fill_value(value.to()); break; - case standalone::c10::ScalarType::Long: + case executorch::backends::aoti::slim::c10::ScalarType::Long: fill_value(value.to()); break; - case standalone::c10::ScalarType::Int: + case executorch::backends::aoti::slim::c10::ScalarType::Int: fill_value(value.to()); break; - case standalone::c10::ScalarType::Short: + case executorch::backends::aoti::slim::c10::ScalarType::Short: fill_value(value.to()); break; - case standalone::c10::ScalarType::Char: + case executorch::backends::aoti::slim::c10::ScalarType::Char: fill_value(value.to()); break; - case standalone::c10::ScalarType::Byte: + 
case executorch::backends::aoti::slim::c10::ScalarType::Byte: fill_value(value.to()); break; - case standalone::c10::ScalarType::Bool: + case executorch::backends::aoti::slim::c10::ScalarType::Bool: fill_value(value.to()); break; - case standalone::c10::ScalarType::ComplexFloat: - fill_value(value.to>()); + case executorch::backends::aoti::slim::c10::ScalarType::ComplexFloat: + fill_value( + value.to>()); break; - case standalone::c10::ScalarType::ComplexDouble: - fill_value(value.to>()); + case executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble: + fill_value( + value.to>()); break; default: STANDALONE_CHECK(false, "fill_: Unsupported dtype"); @@ -452,34 +460,38 @@ class SlimTensor { SlimTensor clone_contiguous() const { std::vector contig_strides = - standalone::slim::compute_contiguous_strides(this->sizes()); + executorch::backends::aoti::slim::compute_contiguous_strides( + this->sizes()); return _clone_impl( this->sizes(), contig_strides, this->dtype(), this->device()); } // View operations SlimTensor as_strided( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset) const; SlimTensor as_strided_( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset); - SlimTensor permute(standalone::c10::IntArrayRef dims) const; + SlimTensor permute( + executorch::backends::aoti::slim::c10::IntArrayRef dims) const; // Transpose operations SlimTensor transpose() const; SlimTensor transpose(int64_t dim0, int64_t dim1) const; SlimTensor t() const; - SlimTensor reshape(standalone::c10::IntArrayRef proposed_shape) const; + SlimTensor reshape( + executorch::backends::aoti::slim::c10::IntArrayRef proposed_shape) const; SlimTensor narrow(int64_t dim, int64_t start, int64_t length) const; // Generic element access returning SlimTensor - SlimTensor operator[](standalone::c10::IntArrayRef indices) const { + SlimTensor operator[]( + executorch::backends::aoti::slim::c10::IntArrayRef indices) const { STANDALONE_CHECK( indices.size() <= this->dim(), "Number of indices (", @@ -494,7 +506,7 @@ class SlimTensor { for (size_t i = 0; i < indices.size(); ++i) { int64_t idx = indices[i]; int64_t size = this->size(i); - idx = standalone::c10::maybe_wrap_dim(idx, size); + idx = executorch::backends::aoti::slim::c10::maybe_wrap_dim(idx, size); linear_index += idx * this->stride(i); } // Create 0-dimensional tensor pointing to the indexed element @@ -511,7 +523,7 @@ class SlimTensor { for (size_t i = 0; i < indices.size(); ++i) { int64_t idx = indices[i]; int64_t size = this->size(i); - idx = standalone::c10::maybe_wrap_dim(idx, size); + idx = executorch::backends::aoti::slim::c10::maybe_wrap_dim(idx, size); offset_adjustment += idx * this->stride(i); } @@ -533,41 +545,43 @@ class SlimTensor { // Convenience overload for single index SlimTensor operator[](int64_t index) const { - return (*this)[standalone::c10::IntArrayRef{index}]; + return (*this)[executorch::backends::aoti::slim::c10::IntArrayRef{index}]; } // Convenience overloads for common multi-dimensional cases SlimTensor operator[](std::initializer_list indices) const { - return (*this)[standalone::c10::IntArrayRef(indices)]; + return (*this)[executorch::backends::aoti::slim::c10::IntArrayRef(indices)]; } // Extract scalar 
value from 0-dimensional tensor - standalone::c10::Scalar item() const { + executorch::backends::aoti::slim::c10::Scalar item() const { switch (this->dtype()) { - case standalone::c10::ScalarType::Double: + case executorch::backends::aoti::slim::c10::ScalarType::Double: return this->item(); - case standalone::c10::ScalarType::Float: + case executorch::backends::aoti::slim::c10::ScalarType::Float: return this->item(); - case standalone::c10::ScalarType::Half: - return this->item(); - case standalone::c10::ScalarType::BFloat16: - return this->item(); - case standalone::c10::ScalarType::Long: + case executorch::backends::aoti::slim::c10::ScalarType::Half: + return this->item(); + case executorch::backends::aoti::slim::c10::ScalarType::BFloat16: + return this->item(); + case executorch::backends::aoti::slim::c10::ScalarType::Long: return this->item(); - case standalone::c10::ScalarType::Int: + case executorch::backends::aoti::slim::c10::ScalarType::Int: return this->item(); - case standalone::c10::ScalarType::Short: + case executorch::backends::aoti::slim::c10::ScalarType::Short: return this->item(); - case standalone::c10::ScalarType::Char: + case executorch::backends::aoti::slim::c10::ScalarType::Char: return this->item(); - case standalone::c10::ScalarType::Byte: + case executorch::backends::aoti::slim::c10::ScalarType::Byte: return this->item(); - case standalone::c10::ScalarType::Bool: + case executorch::backends::aoti::slim::c10::ScalarType::Bool: return this->item(); - case standalone::c10::ScalarType::ComplexFloat: - return this->item>(); - case standalone::c10::ScalarType::ComplexDouble: - return this->item>(); + case executorch::backends::aoti::slim::c10::ScalarType::ComplexFloat: + return this + ->item>(); + case executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble: + return this + ->item>(); default: STANDALONE_CHECK(false, "item(): Unsupported dtype"); } @@ -589,10 +603,10 @@ class SlimTensor { private: SlimTensor _clone_impl( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device) const { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device) const { Storage storage = new_storage(sizes, strides, dtype, device); SlimTensor result = SlimTensor(std::move(storage), sizes, strides, dtype, 0); @@ -605,7 +619,7 @@ class SlimTensor { } bool compute_is_contiguous() const { - return standalone::c10::_compute_contiguous( + return executorch::backends::aoti::slim::c10::_compute_contiguous( sizes_and_strides_.sizes_arrayref(), sizes_and_strides_.strides_arrayref(), numel_); @@ -619,19 +633,27 @@ class SlimTensor { Storage storage_; // device_type_ and device_index_ are stored in storage_ int64_t storage_offset_{0}; - standalone::c10::SizesAndStrides sizes_and_strides_; + executorch::backends::aoti::slim::c10::SizesAndStrides sizes_and_strides_; // If sizes and strides are empty, the numel is 1!! However, most of the // time, we will immediately set sizes to {0} and reset numel to 0. // (Can't do that in the default initializers, because there's no way to // spell "allocate a one-element array" for strides_). 
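// Illustrative sketch (not part of the patch): operator[] above resolves a
// multi-dimensional index to a storage offset by summing index * stride per
// dimension, which is why views can share one storage. A standalone version
// of that addressing rule, with hypothetical names:
#include <cassert>
#include <cstdint>
#include <vector>

static int64_t linear_offset(
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& strides,
    int64_t storage_offset = 0) {
  int64_t off = storage_offset;
  for (size_t i = 0; i < indices.size(); ++i) {
    off += indices[i] * strides[i];  // same rule as idx * this->stride(i)
  }
  return off;
}

int main() {
  // A 2x3 row-major (contiguous) tensor has strides {3, 1}.
  std::vector<int64_t> strides = {3, 1};
  assert((linear_offset({0, 0}, strides) == 0));
  assert((linear_offset({1, 2}, strides) == 5));  // last element
  // A transposed view reuses the same storage with strides {1, 3}.
  assert((linear_offset({2, 1}, {1, 3}) == 5));
  return 0;
}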
size_t numel_{1}; - standalone::c10::ScalarType dtype_; + executorch::backends::aoti::slim::c10::ScalarType dtype_; bool is_contiguous_{true}; // NOLINTNEXTLINE(clang-diagnostic-unused-private-field) std::array reserved_{0}; // padding to align to 8 bytes }; -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::backends::aoti::slim::SlimTensor; +} // namespace executor +} // namespace torch #include #include diff --git a/backends/aoti/slim/core/SlimTensorResize-incl.h b/backends/aoti/slim/core/SlimTensorResize-incl.h index e9de9f5e0a6..136d4821e74 100644 --- a/backends/aoti/slim/core/SlimTensorResize-incl.h +++ b/backends/aoti/slim/core/SlimTensorResize-incl.h @@ -6,9 +6,9 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { inline void SlimTensor::empty_tensor_restride( - standalone::c10::MemoryFormat memory_format) { + executorch::backends::aoti::slim::c10::MemoryFormat memory_format) { #ifdef DEBUG STANDALONE_INTERNAL_ASSERT( compute_numel() == numel_, @@ -16,7 +16,7 @@ inline void SlimTensor::empty_tensor_restride( "called before setting correct numel"); #endif switch (memory_format) { - case standalone::c10::MemoryFormat::Contiguous: { + case executorch::backends::aoti::slim::c10::MemoryFormat::Contiguous: { // dim_ is a virtual call, don't repeat it const auto dim_ = dim(); sizes_and_strides_.resize(dim_); @@ -25,7 +25,7 @@ inline void SlimTensor::empty_tensor_restride( const auto last_idx = dim_ - 1; sizes_and_strides_.stride_at_unchecked(last_idx) = 1; for (int64_t i = static_cast(last_idx) - 1; i >= 0; --i) { - overflowed |= standalone::c10::mul_overflows( + overflowed |= executorch::backends::aoti::slim::c10::mul_overflows( sizes_and_strides_.stride_at_unchecked(i + 1), std::max(sizes_and_strides_.size_at_unchecked(i + 1), 1), std::addressof(sizes_and_strides_.stride_at_unchecked(i))); @@ -34,24 +34,24 @@ inline void SlimTensor::empty_tensor_restride( } break; } - case standalone::c10::MemoryFormat::ChannelsLast: { + case executorch::backends::aoti::slim::c10::MemoryFormat::ChannelsLast: { STANDALONE_CHECK( dim() == 4, "required rank 4 tensor to use channels_last format"); set_sizes_and_strides(sizes(), get_channels_last_strides_2d(sizes())); break; } - case standalone::c10::MemoryFormat::ChannelsLast3d: { + case executorch::backends::aoti::slim::c10::MemoryFormat::ChannelsLast3d: { STANDALONE_CHECK( dim() == 5, "required rank 5 tensor to use channels_last_3d format"); set_sizes_and_strides(sizes(), get_channels_last_strides_3d(sizes())); break; } - case standalone::c10::MemoryFormat::Preserve: + case executorch::backends::aoti::slim::c10::MemoryFormat::Preserve: STANDALONE_CHECK(false, "unsupported memory format ", memory_format); // Cleaning warning messages, no need to break as STANDALONE_CHECK(false) // terminates flow. 
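// Illustrative sketch (not part of the patch): the Contiguous case of
// empty_tensor_restride above rebuilds strides right-to-left, clamping
// size-0 dims to 1 so their stride stays well defined. A minimal version of
// that rule with hypothetical names, omitting the mul_overflows check the
// real code performs:
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> contiguous_strides(
    const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = running;  // stride = product of trailing sizes
    running *= std::max<int64_t>(sizes[i], 1);
  }
  return strides;
}

int main() {
  assert((contiguous_strides({2, 3, 4}) == std::vector<int64_t>{12, 4, 1}));
  assert((contiguous_strides({2, 0, 4}) == std::vector<int64_t>{4, 4, 1}));
  return 0;
}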
// break; - case standalone::c10::MemoryFormat::NumOptions: + case executorch::backends::aoti::slim::c10::MemoryFormat::NumOptions: STANDALONE_INTERNAL_ASSERT( false, "invalid memory format ", memory_format); } @@ -125,8 +125,8 @@ inline void _maybe_resize_storage(SlimTensor* self, int64_t new_size_bytes) { inline SlimTensor* _resize_impl_( SlimTensor* self, - standalone::c10::IntArrayRef sizes, - std::optional strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + std::optional strides, bool resize_storage) { if (self->sizes() == sizes && (!strides || self->strides() == strides.value())) { @@ -154,16 +154,17 @@ inline SlimTensor* _resize_impl_( } inline SlimTensor SlimTensor::resize_( - standalone::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, std::optional optional_memory_format) { _resize_impl_(this, sizes, /*stride=*/std::nullopt, true); if (optional_memory_format.has_value()) { - standalone::c10::MemoryFormat memory_format = - static_cast( + executorch::backends::aoti::slim::c10::MemoryFormat memory_format = + static_cast( optional_memory_format.value()); STANDALONE_CHECK( - memory_format != standalone::c10::MemoryFormat::Preserve, + memory_format != + executorch::backends::aoti::slim::c10::MemoryFormat::Preserve, "Unsupported memory format", memory_format); this->empty_tensor_restride(memory_format); @@ -171,4 +172,4 @@ inline SlimTensor SlimTensor::resize_( return *this; } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/SlimTensorView-incl.h b/backends/aoti/slim/core/SlimTensorView-incl.h index 0df4c4705f1..c247047900c 100644 --- a/backends/aoti/slim/core/SlimTensorView-incl.h +++ b/backends/aoti/slim/core/SlimTensorView-incl.h @@ -6,10 +6,10 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { inline SlimTensor SlimTensor::as_strided( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset) const { SlimTensor result = *this; result.as_strided_(sizes, strides, storage_offset); @@ -17,8 +17,8 @@ inline SlimTensor SlimTensor::as_strided( } inline SlimTensor SlimTensor::as_strided_( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset) { STANDALONE_CHECK( sizes.size() == strides.size(), @@ -44,20 +44,22 @@ inline SlimTensor SlimTensor::as_strided_( return *this; } -inline SlimTensor SlimTensor::permute(standalone::c10::IntArrayRef dims) const { +inline SlimTensor SlimTensor::permute( + executorch::backends::aoti::slim::c10::IntArrayRef dims) const { const size_t ndim = this->dim(); STANDALONE_CHECK( ndim == static_cast(dims.size()), "permute: dims length must be equal to tensor.dim()") - standalone::c10::ArrayRef old_sizes = this->sizes(); - standalone::c10::ArrayRef old_strides = this->strides(); + executorch::backends::aoti::slim::c10::ArrayRef old_sizes = this->sizes(); + executorch::backends::aoti::slim::c10::ArrayRef old_strides = this->strides(); std::vector new_sizes = old_sizes.vec(); std::vector new_strides = old_strides.vec(); std::vector seen_dims(ndim, false); for (size_t i = 0; i < ndim; i++) { - int64_t d = standalone::c10::maybe_wrap_dim(dims[i], ndim); + int64_t d = + 
executorch::backends::aoti::slim::c10::maybe_wrap_dim(dims[i], ndim); STANDALONE_CHECK(!seen_dims[d], "permute: duplicate dims are not allowed"); seen_dims[d] = true; new_sizes[i] = old_sizes[d]; @@ -82,8 +84,8 @@ inline SlimTensor SlimTensor::transpose(int64_t dim0, int64_t dim1) const { } // Wrap dimensions and swap them - dim0 = standalone::c10::maybe_wrap_dim(dim0, ndim); - dim1 = standalone::c10::maybe_wrap_dim(dim1, ndim); + dim0 = executorch::backends::aoti::slim::c10::maybe_wrap_dim(dim0, ndim); + dim1 = executorch::backends::aoti::slim::c10::maybe_wrap_dim(dim1, ndim); std::swap(dims[dim0], dims[dim1]); return permute(dims); @@ -94,7 +96,7 @@ inline SlimTensor SlimTensor::t() const { } inline SlimTensor SlimTensor::reshape( - standalone::c10::IntArrayRef proposed_shape) const { + executorch::backends::aoti::slim::c10::IntArrayRef proposed_shape) const { std::vector final_shape_vec = infer_size(proposed_shape, this->numel()); @@ -124,8 +126,9 @@ inline SlimTensor SlimTensor::narrow(int64_t dim, int64_t start, int64_t length) const { STANDALONE_CHECK( this->dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); - dim = standalone::c10::maybe_wrap_dim(dim, static_cast(this->dim())); - start = standalone::c10::maybe_wrap_dim( + dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim( + dim, static_cast(this->dim())); + start = executorch::backends::aoti::slim::c10::maybe_wrap_dim( start, static_cast(this->size(dim))); STANDALONE_CHECK(length >= 0, "narrow(): length must be non-negative."); @@ -149,4 +152,4 @@ inline SlimTensor SlimTensor::narrow(int64_t dim, int64_t start, int64_t length) return result; } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/Storage.h b/backends/aoti/slim/core/Storage.h index 4230a0d2b0a..135b44bca23 100644 --- a/backends/aoti/slim/core/Storage.h +++ b/backends/aoti/slim/core/Storage.h @@ -16,29 +16,35 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { using DeleterFn = void (*)(void*); namespace detail { inline void noop(void*) {} } // namespace detail -const standalone::c10::Device CPU_DEVICE = - standalone::c10::Device(standalone::c10::DeviceType::CPU, 0); +const executorch::backends::aoti::slim::c10::Device CPU_DEVICE = + executorch::backends::aoti::slim::c10::Device( + executorch::backends::aoti::slim::c10::DeviceType::CPU, + 0); -const standalone::c10::Device DEFAULT_CUDA_DEVICE = - standalone::c10::Device(standalone::c10::DeviceType::CUDA, 0); +const executorch::backends::aoti::slim::c10::Device DEFAULT_CUDA_DEVICE = + executorch::backends::aoti::slim::c10::Device( + executorch::backends::aoti::slim::c10::DeviceType::CUDA, + 0); -// standalone::c10::Device traits template for device-specific operations -template +// executorch::backends::aoti::slim::c10::Device traits template for +// device-specific operations +template struct DeviceTraits; // CPU specialization template <> -struct DeviceTraits { +struct DeviceTraits { static void* allocate( size_t nbytes, - const standalone::c10::Device& device = CPU_DEVICE) { + const executorch::backends::aoti::slim::c10::Device& device = + CPU_DEVICE) { // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) return malloc(nbytes); } @@ -52,8 +58,8 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const 
executorch::backends::aoti::slim::c10::Device& src_device) { std::memcpy(dst, src, nbytes); } }; @@ -61,9 +67,11 @@ struct DeviceTraits { // CUDA specialization #ifdef USE_CUDA template <> -struct DeviceTraits { - static void* allocate(size_t nbytes, const standalone::c10::Device& device) { - standalone::slim::cuda::CUDAGuard guard(device); +struct DeviceTraits { + static void* allocate( + size_t nbytes, + const executorch::backends::aoti::slim::c10::Device& device) { + executorch::backends::aoti::slim::cuda::CUDAGuard guard(device); void* data = nullptr; STANDALONE_CUDA_CHECK(cudaMalloc(&data, nbytes)); return data; @@ -77,11 +85,11 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const executorch::backends::aoti::slim::c10::Device& src_device) { // Determine the direction cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - standalone::c10::Device cuda_device = + executorch::backends::aoti::slim::c10::Device cuda_device = dst_device; // Default to destination device if (src_device.is_cpu()) { @@ -98,14 +106,16 @@ struct DeviceTraits { dst_device.index()); } // Set up CUDA context for the appropriate device - standalone::slim::cuda::CUDAGuard guard(cuda_device); + executorch::backends::aoti::slim::cuda::CUDAGuard guard(cuda_device); STANDALONE_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction)); } }; #else template <> -struct DeviceTraits { - static void* allocate(size_t nbytes, const standalone::c10::Device& device) { +struct DeviceTraits { + static void* allocate( + size_t nbytes, + const executorch::backends::aoti::slim::c10::Device& device) { STANDALONE_CHECK(false, "Build with USE_CUDA=1 to enable CUDA support"); } @@ -117,8 +127,8 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const executorch::backends::aoti::slim::c10::Device& src_device) { STANDALONE_CHECK(false, "Build with USE_CUDA=1 to enable CUDA support"); } }; @@ -129,24 +139,29 @@ struct DeviceTraits { // non-owning. class MaybeOwningStorage { public: - MaybeOwningStorage(const standalone::c10::Device& device, size_t nbytes) + MaybeOwningStorage( + const executorch::backends::aoti::slim::c10::Device& device, + size_t nbytes) : device_(device), capacity_(nbytes), is_owning_(true) { // Allocating memory here so owning_ has to be true. 
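// Illustrative sketch (not part of the patch): MaybeOwningStorage around
// this hunk distinguishes owning from non-owning storage with a deleter
// function pointer, so borrowed, externally managed buffers get a no-op
// deleter. A minimal CPU-only version of that pattern, with hypothetical
// names; the real code additionally wraps the storage in a shared pointer
// (Storage = SharedPtr) so copies cannot double-free:
#include <cassert>
#include <cstddef>
#include <cstdlib>

using DeleterFn = void (*)(void*);
static void noop_delete(void*) {}
static void free_delete(void* p) { std::free(p); }

struct OwnedOrBorrowed {
  void* data;
  DeleterFn deleter;

  static OwnedOrBorrowed allocate(std::size_t nbytes) {
    return {std::malloc(nbytes), free_delete};  // owning: frees on destroy
  }
  static OwnedOrBorrowed borrow(void* external) {
    return {external, noop_delete};  // non-owning: destroy does nothing
  }
  ~OwnedOrBorrowed() { deleter(data); }
};

int main() {
  { OwnedOrBorrowed a = OwnedOrBorrowed::allocate(64); assert(a.data); }
  int external = 42;
  { OwnedOrBorrowed b = OwnedOrBorrowed::borrow(&external); assert(b.data); }
  assert(external == 42);  // untouched after the borrow is destroyed
  return 0;
}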
if (device.is_cpu()) { - data_ = DeviceTraits::allocate( - nbytes, device); - deleter_ = DeviceTraits::free; + data_ = + DeviceTraits:: + allocate(nbytes, device); + deleter_ = DeviceTraits< + executorch::backends::aoti::slim::c10::DeviceType::CPU>::free; } else if (device.is_cuda()) { - data_ = DeviceTraits::allocate( - nbytes, device); - deleter_ = DeviceTraits::free; + data_ = DeviceTraits::allocate(nbytes, device); + deleter_ = DeviceTraits< + executorch::backends::aoti::slim::c10::DeviceType::CUDA>::free; } else { STANDALONE_CHECK(false, "Unsupported device type"); } } MaybeOwningStorage( - const standalone::c10::Device& device, + const executorch::backends::aoti::slim::c10::Device& device, void* data, size_t nbytes) : device_(device), data_(data), capacity_(nbytes), is_owning_(false) { @@ -201,7 +216,7 @@ class MaybeOwningStorage { void* dst_data_ptr, void* src_data_ptr, size_t nbytes, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& src_device) { STANDALONE_CHECK( dst_data_ptr, "Storage clone failed: dst_data_ptr can not be nullptr") STANDALONE_CHECK( @@ -221,7 +236,8 @@ class MaybeOwningStorage { } } - MaybeOwningStorage clone(const standalone::c10::Device& device) const { + MaybeOwningStorage clone( + const executorch::backends::aoti::slim::c10::Device& device) const { STANDALONE_CHECK( data_, "Storage clone failed: source data can not be nullptr") // Create a new owning storage with the specified device and same capacity @@ -230,12 +246,12 @@ class MaybeOwningStorage { // Copy the data from the current storage to the new storage if (device_.is_cpu() && device.is_cpu()) { // CPU to CPU copy - DeviceTraits::memcpy( - cloned_storage.data_, data_, capacity_, device, device_); + DeviceTraits:: + memcpy(cloned_storage.data_, data_, capacity_, device, device_); } else { // At least one of the devices is CUDA - DeviceTraits::memcpy( - cloned_storage.data_, data_, capacity_, device, device_); + DeviceTraits:: + memcpy(cloned_storage.data_, data_, capacity_, device, device_); } return cloned_storage; @@ -249,7 +265,7 @@ class MaybeOwningStorage { return data_; } - const standalone::c10::Device& device() const { + const executorch::backends::aoti::slim::c10::Device& device() const { return device_; } @@ -286,7 +302,7 @@ class MaybeOwningStorage { } private: - standalone::c10::Device device_ = CPU_DEVICE; + executorch::backends::aoti::slim::c10::Device device_ = CPU_DEVICE; void* data_ = nullptr; size_t capacity_ = 0; DeleterFn deleter_ = detail::noop; @@ -296,12 +312,15 @@ class MaybeOwningStorage { using Storage = SharedPtr; inline Storage new_storage( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device = CPU_DEVICE) { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) { size_t nbytes = compute_storage_nbytes( - sizes, strides, standalone::c10::elementSize(dtype), 0); + sizes, + strides, + executorch::backends::aoti::slim::c10::elementSize(dtype), + 0); return Storage(new MaybeOwningStorage(device, nbytes)); } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/cuda/Guard.h b/backends/aoti/slim/cuda/Guard.h index c9b2441b148..2fcafce92f9 100644 --- 
diff --git a/backends/aoti/slim/cuda/Guard.h b/backends/aoti/slim/cuda/Guard.h
index c9b2441b148..2fcafce92f9 100644
--- a/backends/aoti/slim/cuda/Guard.h
+++ b/backends/aoti/slim/cuda/Guard.h
@@ -14,19 +14,20 @@
 #include
 #include
 
-namespace standalone::slim::cuda {
+namespace executorch::backends::aoti::slim::cuda {
 
 // Thread-local stream management
 namespace detail {
-inline thread_local std::
-    unordered_map<standalone::c10::DeviceIndex, cudaStream_t>
-        current_streams_;
+inline thread_local std::unordered_map<
+    executorch::backends::aoti::slim::c10::DeviceIndex,
+    cudaStream_t>
+    current_streams_;
 }
 
 /// Set the current CUDA stream for the specified device
 inline void setCurrentCUDAStream(
     cudaStream_t stream,
-    standalone::c10::DeviceIndex device_index = -1) {
+    executorch::backends::aoti::slim::c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     // Get current device if not specified
     int current_device;
@@ -39,7 +40,7 @@ inline void setCurrentCUDAStream(
 
 /// Get the current CUDA stream for the specified device
 inline cudaStream_t getCurrentCUDAStream(
-    standalone::c10::DeviceIndex device_index = -1) {
+    executorch::backends::aoti::slim::c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     // Get current device if not specified
     int current_device;
@@ -64,13 +65,14 @@ struct CUDAGuard {
   explicit CUDAGuard() = delete;
 
   /// Set the current CUDA device to the passed device index.
-  explicit CUDAGuard(standalone::c10::DeviceIndex device_index) {
+  explicit CUDAGuard(
+      executorch::backends::aoti::slim::c10::DeviceIndex device_index) {
     set_index(device_index);
   }
 
   /// Sets the current CUDA device to the passed device. Errors if the passed
   /// device is not a CUDA device.
-  explicit CUDAGuard(standalone::c10::Device device) {
+  explicit CUDAGuard(executorch::backends::aoti::slim::c10::Device device) {
     STANDALONE_CHECK(
         device.is_cuda(),
         "Expected a CUDA device for CUDAGuard, but got ",
@@ -94,7 +96,8 @@ struct CUDAGuard {
   }
 
   /// Sets the CUDA device to the given device index.
-  void set_index(standalone::c10::DeviceIndex device_index) {
+  void set_index(
+      executorch::backends::aoti::slim::c10::DeviceIndex device_index) {
     int orig_index = -1;
     STANDALONE_CUDA_CHECK(cudaGetDevice(&orig_index));
@@ -107,8 +110,8 @@ struct CUDAGuard {
 
  private:
   /// The guard for the current device.
-  standalone::c10::DeviceIndex original_device_index_;
-  standalone::c10::DeviceIndex current_device_index_;
+  executorch::backends::aoti::slim::c10::DeviceIndex original_device_index_;
+  executorch::backends::aoti::slim::c10::DeviceIndex current_device_index_;
 };
 
 struct CUDAStreamGuard {
@@ -118,7 +121,7 @@ struct CUDAStreamGuard {
   /// Set the current CUDA stream to the passed stream on the specified device.
   explicit CUDAStreamGuard(
       cudaStream_t stream,
-      standalone::c10::DeviceIndex device_index)
+      executorch::backends::aoti::slim::c10::DeviceIndex device_index)
       : device_guard_(device_index) {
     set_stream(stream, device_index);
   }
@@ -140,7 +143,7 @@ struct CUDAStreamGuard {
   /// Sets the CUDA stream to the given stream on the specified device.
   void set_stream(
       cudaStream_t stream,
-      standalone::c10::DeviceIndex device_index) {
+      executorch::backends::aoti::slim::c10::DeviceIndex device_index) {
     // Store the original stream for this device
     original_stream_ = getCurrentCUDAStream(device_index);
     current_stream_ = stream;
@@ -156,7 +159,7 @@ struct CUDAStreamGuard {
   }
 
   /// Get the device index being guarded
-  standalone::c10::DeviceIndex device_index() const {
+  executorch::backends::aoti::slim::c10::DeviceIndex device_index() const {
     return device_index_;
   }
 
@@ -168,7 +171,7 @@ struct CUDAStreamGuard {
   /// The current stream being guarded
   cudaStream_t current_stream_ = nullptr;
   /// The device index for this stream guard
-  standalone::c10::DeviceIndex device_index_;
+  executorch::backends::aoti::slim::c10::DeviceIndex device_index_;
 };
 
-} // namespace standalone::slim::cuda
+} // namespace executorch::backends::aoti::slim::cuda
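The guards above are plain RAII wrappers: CUDAGuard restores the previous device on destruction, and CUDAStreamGuard additionally restores the thread-local stream recorded in detail::current_streams_. A hedged usage sketch (assumes a CUDA build with USE_CUDA=1; the stream setup is illustrative only):

// Sketch only: pin device 0 and a private stream for the scope of a block.
#include <backends/aoti/slim/cuda/Guard.h>

namespace slim = executorch::backends::aoti::slim;

void guarded_work_example() {
  cudaStream_t stream = nullptr;
  STANDALONE_CUDA_CHECK(cudaStreamCreate(&stream));
  {
    // Saves the current device and stream, switches to device 0 + `stream`,
    // and restores both when the guard leaves scope.
    slim::cuda::CUDAStreamGuard guard(stream, /*device_index=*/0);
    // ... issue async work on slim::cuda::getCurrentCUDAStream() here ...
  }
  STANDALONE_CUDA_CHECK(cudaStreamDestroy(stream));
}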
diff --git a/backends/aoti/slim/factory/Empty.h b/backends/aoti/slim/factory/Empty.h
index bbd4996b84c..20dd89fe1e6 100644
--- a/backends/aoti/slim/factory/Empty.h
+++ b/backends/aoti/slim/factory/Empty.h
@@ -7,23 +7,23 @@
 #include
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 // The returned SlimTensor owns the underlying storage
 inline SlimTensor empty_strided(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::IntArrayRef strides,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef strides,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   Storage storage = new_storage(sizes, strides, dtype, device);
   return SlimTensor(std::move(storage), sizes, strides, dtype, 0);
 }
 
 inline SlimTensor empty(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   std::vector<int64_t> contig_strides =
-      standalone::slim::compute_contiguous_strides(sizes);
+      executorch::backends::aoti::slim::compute_contiguous_strides(sizes);
   Storage storage = new_storage(sizes, contig_strides, dtype, device);
   return SlimTensor(std::move(storage), sizes, contig_strides, dtype, 0);
 }
@@ -32,4 +32,4 @@ inline SlimTensor empty_like(const SlimTensor& other) {
   return empty_strided(
       other.sizes(), other.strides(), other.dtype(), other.device());
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/Factory.h b/backends/aoti/slim/factory/Factory.h
index 5e172bc9f6a..f0d26041ad3 100644
--- a/backends/aoti/slim/factory/Factory.h
+++ b/backends/aoti/slim/factory/Factory.h
@@ -2,13 +2,13 @@
 
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 inline SlimTensor zeros(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor tensor = empty(sizes, dtype, device);
-  tensor.fill_(standalone::c10::Scalar(0));
+  tensor.fill_(executorch::backends::aoti::slim::c10::Scalar(0));
   return tensor;
 }
 
@@ -17,11 +17,11 @@ inline SlimTensor zeros_like(const SlimTensor& other) {
 }
 
 inline SlimTensor ones(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor tensor = empty(sizes, dtype, device);
-  tensor.fill_(standalone::c10::Scalar(1));
+  tensor.fill_(executorch::backends::aoti::slim::c10::Scalar(1));
   return tensor;
 }
 
@@ -29,4 +29,4 @@ inline SlimTensor ones_like(const SlimTensor& other) {
   return ones(other.sizes(), other.dtype(), other.device());
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/FromBlob.h b/backends/aoti/slim/factory/FromBlob.h
index d1877f7f31d..c7a558f72ed 100644
--- a/backends/aoti/slim/factory/FromBlob.h
+++ b/backends/aoti/slim/factory/FromBlob.h
@@ -2,15 +2,15 @@
 
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 // The returned SlimTensor does not own the underlying storage
 inline SlimTensor from_blob(
     void* data,
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::IntArrayRef strides,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef strides,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE,
     int64_t storage_offset = 0) {
   STANDALONE_CHECK(data != nullptr, "data pointer can not be nullptr");
@@ -24,13 +24,13 @@ inline SlimTensor from_blob(
 
 inline SlimTensor from_blob(
     void* data,
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE,
     int64_t storage_offset = 0) {
   std::vector<int64_t> contig_strides =
-      standalone::slim::compute_contiguous_strides(sizes);
+      executorch::backends::aoti::slim::compute_contiguous_strides(sizes);
   return from_blob(data, sizes, contig_strides, dtype, device, storage_offset);
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/FromScalar.h b/backends/aoti/slim/factory/FromScalar.h
index 223f734d940..df01121a6f7 100644
--- a/backends/aoti/slim/factory/FromScalar.h
+++ b/backends/aoti/slim/factory/FromScalar.h
@@ -2,14 +2,14 @@
 
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 inline SlimTensor scalar_to_tensor(
-    const standalone::c10::Scalar& s,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    const executorch::backends::aoti::slim::c10::Scalar& s,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor result = empty_strided({}, {}, s.type(), device);
   result.fill_(s);
   return result;
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/Pad.h b/backends/aoti/slim/factory/Pad.h
index 4d7fef731bd..44a83696a14 100644
--- a/backends/aoti/slim/factory/Pad.h
+++ b/backends/aoti/slim/factory/Pad.h
@@ -2,15 +2,15 @@
 
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 inline SlimTensor constant_pad_nd(
     const SlimTensor& self,
-    standalone::c10::IntArrayRef pad,
-    const standalone::c10::Scalar& value) {
+    executorch::backends::aoti::slim::c10::IntArrayRef pad,
+    const executorch::backends::aoti::slim::c10::Scalar& value) {
   STANDALONE_CHECK(pad.size() % 2 == 0, "Length of pad must be even");
 
-  standalone::c10::IntArrayRef input_sizes = self.sizes();
+  executorch::backends::aoti::slim::c10::IntArrayRef input_sizes = self.sizes();
   int64_t l_inp = self.dim();
   int64_t l_pad = static_cast<int64_t>(pad.size()) / 2;
   int64_t l_diff = l_inp - l_pad;
@@ -50,7 +50,8 @@ inline SlimTensor constant_pad_nd(
     new_shape.emplace_back(input_sizes[i]);
   }
 
-  for (const auto i : standalone::c10::irange((size_t)l_pad)) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange((size_t)l_pad)) {
     auto pad_idx = pad.size() - ((i + 1) * 2);
     auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
     STANDALONE_CHECK(
@@ -73,7 +74,8 @@ inline SlimTensor constant_pad_nd(
   // create a view into the center of the output tensor
   SlimTensor c_output = output;
-  for (const auto i : standalone::c10::irange(l_diff, l_inp)) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(l_diff, l_inp)) {
     auto pad_idx = 2 * (l_inp - i - 1);
     if (pad[pad_idx] > 0) {
       c_output =
@@ -90,7 +92,7 @@ inline SlimTensor constant_pad_nd(
 
 inline SlimTensor pad(
     const SlimTensor& self,
-    standalone::c10::IntArrayRef pad,
+    executorch::backends::aoti::slim::c10::IntArrayRef pad,
     std::string_view mode,
     std::optional<double> value) {
   if (mode == "constant") {
@@ -103,4 +105,4 @@ inline SlimTensor pad(
     ". Only constant mode is available.");
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
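Taken together, the factory headers above form the public construction surface of SlimTensor. A quick sketch of how call sites look after the rename (include paths are assumed; pad takes (left, right) pairs per trailing dimension, innermost last, following the ATen convention):

// Sketch only: the renamed factory entry points in one place.
#include <backends/aoti/slim/factory/Empty.h>
#include <backends/aoti/slim/factory/FromBlob.h>
#include <backends/aoti/slim/factory/Pad.h>

namespace slim = executorch::backends::aoti::slim;

void factory_example() {
  // Owning, contiguous 2x3 float tensor.
  slim::SlimTensor a = slim::empty({2, 3}, slim::c10::ScalarType::Float);

  // Non-owning view over caller-managed memory; `buf` must outlive `b`.
  float buf[6] = {0};
  slim::SlimTensor b =
      slim::from_blob(buf, {2, 3}, slim::c10::ScalarType::Float);

  // Pad the last dim by one element on each side: {2, 3} -> {2, 5}.
  slim::SlimTensor c = slim::constant_pad_nd(a, {1, 1}, slim::c10::Scalar(0));
}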
diff --git a/backends/aoti/slim/tests/test_slim_tensor_basic.cpp b/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
index 37b6ccb240d..e37f252c740 100644
--- a/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
+++ b/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
@@ -14,31 +14,39 @@
 #include
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 namespace {
 
 TEST(SlimTensorBasicTest, EmptyTensorCreation) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.dim(), 3);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
   EXPECT_EQ(tensor.size(2), 4);
   EXPECT_EQ(tensor.numel(), 24);
-  EXPECT_EQ(tensor.dtype(), standalone::c10::ScalarType::Float);
+  EXPECT_EQ(
+      tensor.dtype(), executorch::backends::aoti::slim::c10::ScalarType::Float);
   EXPECT_TRUE(tensor.is_contiguous());
 }
 
 TEST(SlimTensorBasicTest, EmptyTensorContiguousStrides) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.stride(0), 12);
   EXPECT_EQ(tensor.stride(1), 4);
   EXPECT_EQ(tensor.stride(2), 1);
 }
 
 TEST(SlimTensorBasicTest, ZerosTensorCreation) {
-  auto tensor = zeros({3, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = zeros(
+      {3, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.numel(), 9);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 9; ++i) {
@@ -47,7 +55,10 @@ TEST(SlimTensorBasicTest, ZerosTensorCreation) {
 }
 
 TEST(SlimTensorBasicTest, OnesTensorCreation) {
-  auto tensor = ones({2, 2}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = ones(
+      {2, 2},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.numel(), 4);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 4; ++i) {
@@ -56,7 +67,10 @@ TEST(SlimTensorBasicTest, OnesTensorCreation) {
 }
 
 TEST(SlimTensorBasicTest, FillTensor) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(5.0f);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 6; ++i) {
@@ -67,7 +81,10 @@ TEST(SlimTensorBasicTest, FillTensor) {
 TEST(SlimTensorBasicTest, FromBlobNonOwning) {
   std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   auto tensor = from_blob(
-      data.data(), {2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+      data.data(),
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.dim(), 2);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
@@ -76,7 +93,10 @@ TEST(SlimTensorBasicTest, FromBlobNonOwning) {
 }
 
 TEST(SlimTensorBasicTest, Clone) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(3.14f);
 
   auto cloned = tensor.clone();
@@ -91,10 +111,16 @@ TEST(SlimTensorBasicTest, Clone) {
 }
 
 TEST(SlimTensorBasicTest, CopyFrom) {
-  auto src = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto src = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   src.fill_(2.5f);
-  auto dst = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto dst = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   dst.copy_(src);
 
   float* dst_data = static_cast<float*>(dst.data_ptr());
@@ -104,7 +130,10 @@ TEST(SlimTensorBasicTest, CopyFrom) {
 }
 
 TEST(SlimTensorBasicTest, Reshape) {
-  auto tensor = empty({2, 6}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 6},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(1.0f);
 
   auto reshaped = tensor.reshape({3, 4});
@@ -115,15 +144,20 @@ TEST(SlimTensorBasicTest, Reshape) {
 }
 
 TEST(SlimTensorBasicTest, Transpose) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto transposed = tensor.transpose(0, 1);
   EXPECT_EQ(transposed.size(0), 3);
   EXPECT_EQ(transposed.size(1), 2);
 }
 
 TEST(SlimTensorBasicTest, Permute) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto permuted = tensor.permute({2, 0, 1});
   EXPECT_EQ(permuted.size(0), 4);
   EXPECT_EQ(permuted.size(1), 2);
@@ -131,7 +165,10 @@ TEST(SlimTensorBasicTest, Permute) {
 }
 
 TEST(SlimTensorBasicTest, Narrow) {
-  auto tensor = empty({10}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {10},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   for (int i = 0; i < 10; ++i) {
     static_cast<float*>(tensor.data_ptr())[i] = static_cast<float>(i);
   }
@@ -147,8 +184,10 @@ TEST(SlimTensorBasicTest, Narrow) {
 }
 
 TEST(SlimTensorBasicTest, EmptyLike) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto empty_like_tensor = empty_like(tensor);
   EXPECT_EQ(empty_like_tensor.sizes(), tensor.sizes());
   EXPECT_EQ(empty_like_tensor.dtype(), tensor.dtype());
@@ -156,7 +195,10 @@ TEST(SlimTensorBasicTest, EmptyLike) {
 }
 
 TEST(SlimTensorBasicTest, ZerosLike) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto zeros_tensor = zeros_like(tensor);
 
   EXPECT_EQ(zeros_tensor.sizes(), tensor.sizes());
@@ -167,4 +209,4 @@ TEST(SlimTensorBasicTest, ZerosLike) {
 }
 
 } // namespace
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
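The mechanical pattern in the test churn above is that every one-line standalone::c10:: spelling becomes a multi-line fully qualified call. Call sites that want the old brevity could use a local namespace alias instead; this is a sketch, not something the diff adds:

// Sketch only: an alias keeps call sites close to their pre-rename shape.
namespace slim_c10 = executorch::backends::aoti::slim::c10;
// e.g. inside a test body:
//   auto tensor = empty({2, 3}, slim_c10::ScalarType::Float, CPU_DEVICE);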
diff --git a/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp b/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
index 571d4f99893..2bca695fa15 100644
--- a/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
+++ b/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
@@ -15,7 +15,7 @@
 #include
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 namespace {
 
 class SlimTensorCUDATest : public ::testing::Test {
@@ -30,22 +30,30 @@ class SlimTensorCUDATest : public ::testing::Test {
 };
 
 TEST_F(SlimTensorCUDATest, EmptyCUDATensorCreation) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.dim(), 3);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
   EXPECT_EQ(tensor.size(2), 4);
   EXPECT_EQ(tensor.numel(), 24);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);
   EXPECT_TRUE(tensor.is_contiguous());
 }
 
 TEST_F(SlimTensorCUDATest, ZerosCUDATensor) {
-  auto tensor =
-      zeros({3, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = zeros(
+      {3, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.numel(), 9);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);
 
   std::vector<float> host_data(9);
   cudaMemcpy(
@@ -60,8 +68,10 @@ TEST_F(SlimTensorCUDATest, ZerosCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, OnesCUDATensor) {
-  auto tensor =
-      ones({2, 2}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = ones(
+      {2, 2},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.numel(), 4);
 
   std::vector<float> host_data(4);
@@ -77,8 +87,10 @@ TEST_F(SlimTensorCUDATest, OnesCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, FillCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   tensor.fill_(5.0f);
 
   std::vector<float> host_data(6);
@@ -94,8 +106,10 @@ TEST_F(SlimTensorCUDATest, FillCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, CloneCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   tensor.fill_(3.14f);
 
   auto cloned = tensor.clone();
@@ -116,12 +130,16 @@ TEST_F(SlimTensorCUDATest, CloneCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, CopyCUDAToCUDA) {
-  auto src =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto src = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   src.fill_(2.5f);
 
-  auto dst =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto dst = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   dst.copy_(src);
 
   std::vector<float> host_data(6);
@@ -137,12 +155,16 @@ TEST_F(SlimTensorCUDATest, CopyCUDAToCUDA) {
 }
 
 TEST_F(SlimTensorCUDATest, CopyCPUToCUDA) {
-  auto cpu_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto cpu_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   cpu_tensor.fill_(1.5f);
 
-  auto cuda_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto cuda_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   cuda_tensor.copy_(cpu_tensor);
 
   std::vector<float> host_data(6);
@@ -158,12 +180,16 @@ TEST_F(SlimTensorCUDATest, CopyCPUToCUDA) {
 }
 
 TEST_F(SlimTensorCUDATest, CopyCUDAToCPU) {
-  auto cuda_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto cuda_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   cuda_tensor.fill_(4.5f);
 
-  auto cpu_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto cpu_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   cpu_tensor.copy_(cuda_tensor);
 
   float* data = static_cast<float*>(cpu_tensor.data_ptr());
@@ -174,14 +200,20 @@ TEST_F(SlimTensorCUDATest, CopyCUDAToCPU) {
 
 TEST_F(SlimTensorCUDATest, CUDAGuard) {
   cuda::CUDAGuard guard(0);
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);
 }
 
 TEST_F(SlimTensorCUDATest, ReshapeCUDATensor) {
-  auto tensor =
-      empty({2, 6}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 6},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto reshaped = tensor.reshape({3, 4});
   EXPECT_EQ(reshaped.dim(), 2);
   EXPECT_EQ(reshaped.size(0), 3);
@@ -190,8 +222,10 @@ TEST_F(SlimTensorCUDATest, ReshapeCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, TransposeCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto transposed = tensor.transpose(0, 1);
   EXPECT_EQ(transposed.size(0), 3);
   EXPECT_EQ(transposed.size(1), 2);
@@ -199,8 +233,10 @@ TEST_F(SlimTensorCUDATest, TransposeCUDATensor) {
 }
 
 TEST_F(SlimTensorCUDATest, PermuteCUDATensor) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto permuted = tensor.permute({2, 0, 1});
   EXPECT_EQ(permuted.size(0), 4);
   EXPECT_EQ(permuted.size(1), 2);
@@ -209,4 +245,4 @@ TEST_F(SlimTensorCUDATest, PermuteCUDATensor) {
 }
 
 } // namespace
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/util/SharedPtr.h b/backends/aoti/slim/util/SharedPtr.h
index 9ad565d9ab9..33a4def5845 100644
--- a/backends/aoti/slim/util/SharedPtr.h
+++ b/backends/aoti/slim/util/SharedPtr.h
@@ -7,7 +7,7 @@
 
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 /**
  * NonAtomicSharedPtr - A lightweight, non-thread-safe shared pointer
@@ -210,7 +210,7 @@ std::shared_ptr<T> make_shared(Args&&... args) {
 #else
 
 template <typename T>
-using SharedPtr = ::standalone::slim::NonAtomicSharedPtr<T>;
+using SharedPtr = ::executorch::backends::aoti::slim::NonAtomicSharedPtr<T>;
 
 // make_shared for NonAtomicSharedPtr
 template <typename T, typename... Args>
@@ -219,4 +219,4 @@ NonAtomicSharedPtr<T> make_shared(Args&&... args) {
 }
 #endif // USE_MULTI_THREAD
 
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
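SharedPtr is the ref-counting primitive behind Storage; in single-threaded builds (USE_MULTI_THREAD not defined) it resolves to NonAtomicSharedPtr and skips atomic refcount traffic. A minimal sketch under the new namespace (include path assumed):

// Sketch only: two handles share one control block; no atomics involved.
#include <backends/aoti/slim/util/SharedPtr.h>

namespace slim = executorch::backends::aoti::slim;

void shared_ptr_example() {
  slim::SharedPtr<int> a = slim::make_shared<int>(42);
  slim::SharedPtr<int> b = a; // plain (non-atomic) refcount bump
  // Both handles observe the same object; the last one destroyed frees it.
}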
diff --git a/backends/aoti/slim/util/SizeUtil.h b/backends/aoti/slim/util/SizeUtil.h
index d22416cd176..4eab9fc2329 100644
--- a/backends/aoti/slim/util/SizeUtil.h
+++ b/backends/aoti/slim/util/SizeUtil.h
@@ -11,7 +11,7 @@
 #include
 #include
 
-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 
 #ifndef STANDALONE_MOBILE
 inline constexpr uint64_t storage_max() {
   // int64_t and size_t are used somewhat inconsistently throughout ATen.
@@ -28,9 +28,11 @@ inline constexpr uint64_t storage_max() {
  * tensor. Catches integer overflow that may occur when a tensor
  * using a sparse layout has multiple dimensions with large sizes.
  */
-inline int64_t safe_compute_numel(standalone::c10::IntArrayRef sizes) {
+inline int64_t safe_compute_numel(
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes) {
   uint64_t n = 1;
-  bool overflowed = standalone::c10::safe_multiplies_u64(sizes, &n);
+  bool overflowed =
+      executorch::backends::aoti::slim::c10::safe_multiplies_u64(sizes, &n);
   overflowed |= (n > storage_max());
   STANDALONE_CHECK(!overflowed, "numel: integer multiplication overflow");
   return static_cast<int64_t>(n);
@@ -59,26 +61,30 @@ inline std::vector<int64_t> safe_compute_contiguous_strides(
 }
 #endif // STANDALONE_MOBILE
 
-inline int64_t compute_numel(standalone::c10::IntArrayRef sizes) {
+inline int64_t compute_numel(
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes) {
 #ifndef STANDALONE_MOBILE
   // Use overflow checks if supported by the compiler
   return safe_compute_numel(sizes);
 #else
-  return standalone::c10::multiply_integers(sizes);
+  return executorch::backends::aoti::slim::c10::multiply_integers(sizes);
 #endif
 }
 
 // named computeStorageNbytesContiguous in c10
 inline size_t compute_storage_nbytes_contiguous(
-    standalone::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
     size_t itemsize_bytes,
     size_t storage_offset) {
   // Ignore overflow checks on mobile
 #ifndef STANDALONE_MOBILE
   uint64_t size = 1;
-  bool overflowed = standalone::c10::safe_multiplies_u64(sizes, &size);
-  overflowed |= standalone::c10::add_overflows(size, storage_offset, &size);
-  overflowed |= standalone::c10::mul_overflows(size, itemsize_bytes, &size);
+  bool overflowed =
+      executorch::backends::aoti::slim::c10::safe_multiplies_u64(sizes, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::add_overflows(
+      size, storage_offset, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+      size, itemsize_bytes, &size);
   overflowed |= size > storage_max();
   STANDALONE_CHECK(
       !overflowed, "Storage size calculation overflowed with sizes=", sizes);
@@ -91,8 +97,8 @@ inline size_t compute_storage_nbytes_contiguous(
 
 // named computeStorageNbytes in c10
 inline size_t compute_storage_nbytes(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::IntArrayRef strides,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef strides,
     size_t itemsize_bytes,
     size_t storage_offset) {
   STANDALONE_CHECK(
@@ -109,17 +115,20 @@ inline size_t compute_storage_nbytes(
   // of the last element according to stride
   uint64_t size = storage_offset + 1;
   bool overflowed = false;
-  for (const auto i : standalone::c10::irange(sizes.size())) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(sizes.size())) {
     if (sizes[i] == 0) {
       return 0;
     }
 
     uint64_t strided_size = 0;
-    overflowed |=
-        standalone::c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size);
-    overflowed |= standalone::c10::add_overflows(size, strided_size, &size);
+    overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+        strides[i], sizes[i] - 1, &strided_size);
+    overflowed |= executorch::backends::aoti::slim::c10::add_overflows(
+        size, strided_size, &size);
   }
-  overflowed |= standalone::c10::mul_overflows(size, itemsize_bytes, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+      size, itemsize_bytes, &size);
   overflowed |= size > storage_max();
   STANDALONE_CHECK(
       !overflowed,
@@ -132,7 +141,8 @@
   // size of the underlying storage is 1 bigger than the offset
   // of the last element according to stride
   uint64_t size = 1;
-  for (const auto i : standalone::c10::irange(sizes.size())) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(sizes.size())) {
     if (sizes[i] == 0) {
       return 0;
     }
@@ -165,7 +175,7 @@ inline std::vector<int64_t> compute_contiguous_strides(c10::IntArrayRef sizes) {
 // calculates the final concrete shape by also filling in at most one '-1'
 // dimension.
 inline std::vector<int64_t> infer_size(
-    standalone::c10::IntArrayRef shape,
+    executorch::backends::aoti::slim::c10::IntArrayRef shape,
     int64_t numel) {
   int64_t new_size = 1;
   std::optional<int64_t> infer_dim;
@@ -182,8 +192,8 @@ inline std::vector<int64_t> infer_size(
       result_shape.push_back(-1); // placeholder
     } else {
       STANDALONE_CHECK(shape[dim] >= 0, "invalid shape dimension ", shape[dim]);
-      overflowed |=
-          standalone::c10::mul_overflows(new_size, shape[dim], &new_size);
+      overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+          new_size, shape[dim], &new_size);
       result_shape.push_back(shape[dim]);
     }
   }
@@ -207,9 +217,9 @@
 // If so, it returns the new strides
 // If not, it returns an empty optional
 inline std::optional<std::vector<int64_t>> compute_stride(
-    standalone::c10::IntArrayRef old_sizes,
-    standalone::c10::IntArrayRef old_strides,
-    standalone::c10::IntArrayRef new_sizes) {
+    executorch::backends::aoti::slim::c10::IntArrayRef old_sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef old_strides,
+    executorch::backends::aoti::slim::c10::IntArrayRef new_sizes) {
   if (old_sizes.empty()) {
     return std::vector<int64_t>(new_sizes.size(), 1);
   }
@@ -248,7 +258,7 @@ inline std::optional<std::vector<int64_t>> compute_stride(
        tensor_d--) {
     // TODO: ask if this could lead to overflow by any chance?
     // even if so, overflow is not handled in the aten implementation
-    overflowed |= standalone::c10::mul_overflows(
+    overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
         tensor_numel, old_sizes[tensor_d], &tensor_numel);
 
     bool is_chunk_end = (tensor_d == 0) ||
@@ -280,4 +290,4 @@ inline std::optional<std::vector<int64_t>> compute_stride(
   return new_strides;
 }
 
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
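A worked example of the strided-size rule in compute_storage_nbytes: the storage must reach one element past the largest reachable offset. For sizes {2, 3}, strides {4, 1} (one padding element per row), 4-byte elements, and storage_offset 0, the last element sits at (2 - 1) * 4 + (3 - 1) * 1 = 6, so 7 elements * 4 bytes = 28 bytes. As a sketch (include path assumed):

// Sketch only: sizing a row-padded 2x3 float view.
#include <backends/aoti/slim/util/SizeUtil.h>

namespace slim = executorch::backends::aoti::slim;

void size_util_example() {
  size_t nbytes = slim::compute_storage_nbytes(
      /*sizes=*/{2, 3},
      /*strides=*/{4, 1},
      /*itemsize_bytes=*/4,
      /*storage_offset=*/0);
  // nbytes == 28
}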