diff --git a/backends/aoti/slim/c10/core/Contiguity.h b/backends/aoti/slim/c10/core/Contiguity.h
index d5ff49561ab..80db87eb588 100644
--- a/backends/aoti/slim/c10/core/Contiguity.h
+++ b/backends/aoti/slim/c10/core/Contiguity.h
@@ -6,7 +6,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 template <typename T>
 bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
@@ -148,4 +148,4 @@ bool _compute_non_overlapping_and_dense(
   return true;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/Device.h b/backends/aoti/slim/c10/core/Device.h
index a9a6d3a8136..02e88a30b1e 100644
--- a/backends/aoti/slim/c10/core/Device.h
+++ b/backends/aoti/slim/c10/core/Device.h
@@ -17,7 +17,7 @@
 // Copied from c10/core/DeviceType.h with some modifications
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
 enum class DeviceStringParsingState {
   kSTART,
@@ -341,18 +341,21 @@ inline std::ostream& operator<<(std::ostream& stream, const Device& device) {
   stream << device.str();
   return stream;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 template <>
-struct hash<standalone::c10::Device> {
-  size_t operator()(standalone::c10::Device d) const noexcept {
+struct hash<executorch::backends::aoti::slim::c10::Device> {
+  size_t operator()(
+      executorch::backends::aoti::slim::c10::Device d) const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
     static_assert(
-        sizeof(standalone::c10::DeviceType) == 1, "DeviceType is not 8-bit");
+        sizeof(executorch::backends::aoti::slim::c10::DeviceType) == 1,
+        "DeviceType is not 8-bit");
     static_assert(
-        sizeof(standalone::c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
+        sizeof(executorch::backends::aoti::slim::c10::DeviceIndex) == 1,
+        "DeviceIndex is not 8-bit");
     // Note [Hazard when concatenating signed integers]
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // We must first convert to a same-sized unsigned type, before promoting to
diff --git a/backends/aoti/slim/c10/core/DeviceType.h b/backends/aoti/slim/c10/core/DeviceType.h
index f2631a48f2d..eb024a3595d 100644
--- a/backends/aoti/slim/c10/core/DeviceType.h
+++ b/backends/aoti/slim/c10/core/DeviceType.h
@@ -15,7 +15,7 @@
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class DeviceType : int8_t {
   CPU = 0,
   CUDA = 1, // CUDA.
@@ -122,12 +122,13 @@ inline std::ostream& operator<<(std::ostream& stream, DeviceType type) {
   stream << DeviceTypeName(type, /* lower case */ true);
   return stream;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 template <>
-struct hash<standalone::c10::DeviceType> {
-  std::size_t operator()(standalone::c10::DeviceType k) const {
+struct hash<executorch::backends::aoti::slim::c10::DeviceType> {
+  std::size_t operator()(
+      executorch::backends::aoti::slim::c10::DeviceType k) const {
     return std::hash<int>()(static_cast<int>(k));
   }
 };
diff --git a/backends/aoti/slim/c10/core/Layout.h b/backends/aoti/slim/c10/core/Layout.h
index 79230f23bb7..4d7b5499088 100644
--- a/backends/aoti/slim/c10/core/Layout.h
+++ b/backends/aoti/slim/c10/core/Layout.h
@@ -5,7 +5,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class Layout : int8_t {
   Strided,
   Sparse,
@@ -50,4 +50,4 @@ inline std::ostream& operator<<(std::ostream& stream, c10::Layout layout) {
   }
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/MemoryFormat.h b/backends/aoti/slim/c10/core/MemoryFormat.h
index 756caf64f26..68f1a6d7357 100644
--- a/backends/aoti/slim/c10/core/MemoryFormat.h
+++ b/backends/aoti/slim/c10/core/MemoryFormat.h
@@ -25,7 +25,7 @@
 // Regardless of input tensors format, the output should be in channels_last
 // format.
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 enum class MemoryFormat : int8_t {
   Contiguous,
   Preserve,
@@ -38,7 +38,7 @@ enum class MemoryFormat : int8_t {
 // the memory format could be preserved, and it was switched to old default
 // behaviour of contiguous
 #define LEGACY_CONTIGUOUS_MEMORY_FORMAT \
-  ::standalone::c10::get_contiguous_memory_format()
+  ::executorch::backends::aoti::slim::c10::get_contiguous_memory_format()
 inline MemoryFormat get_contiguous_memory_format() {
   return MemoryFormat::Contiguous;
@@ -288,4 +288,4 @@ inline bool is_channels_last_strides_3d(
   return is_channels_last_strides_3d(sizes, strides);
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/Scalar.h b/backends/aoti/slim/c10/core/Scalar.h
index 1c61ecb4704..b46add34946 100644
--- a/backends/aoti/slim/c10/core/Scalar.h
+++ b/backends/aoti/slim/c10/core/Scalar.h
@@ -15,7 +15,7 @@
 // Copy-pasted from c10/core/Scalar.h, but dropping SymScalar support
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /**
  * Scalar represents a 0-dimensional tensor which contains a single element.
@@ -86,22 +86,23 @@ class Scalar {
     v.i = convert(vv);
   }
-#define DEFINE_ACCESSOR(type, name) \
-  type to##name() const { \
-    if (Tag::HAS_d == tag) { \
-      return checked_convert<type, double>(v.d, #type); \
-    } else if (Tag::HAS_z == tag) { \
-      return checked_convert<type, standalone::c10::complex<double>>( \
-          v.z, #type); \
-    } \
-    if (Tag::HAS_b == tag) { \
-      return checked_convert<type, bool>(v.i, #type); \
-    } else if (Tag::HAS_i == tag) { \
-      return checked_convert<type, int64_t>(v.i, #type); \
-    } else if (Tag::HAS_u == tag) { \
-      return checked_convert<type, uint64_t>(v.u, #type); \
-    } \
-    STANDALONE_CHECK(false) \
+#define DEFINE_ACCESSOR(type, name) \
+  type to##name() const { \
+    if (Tag::HAS_d == tag) { \
+      return checked_convert<type, double>(v.d, #type); \
+    } else if (Tag::HAS_z == tag) { \
+      return checked_convert< \
+          type, \
+          executorch::backends::aoti::slim::c10::complex<double>>(v.z, #type); \
+    } \
+    if (Tag::HAS_b == tag) { \
+      return checked_convert<type, bool>(v.i, #type); \
+    } else if (Tag::HAS_i == tag) { \
+      return checked_convert<type, int64_t>(v.i, #type); \
+    } else if (Tag::HAS_u == tag) { \
+      return checked_convert<type, uint64_t>(v.u, #type); \
+    } \
+    STANDALONE_CHECK(false) \
   }
 // TODO: Support ComplexHalf accessor
@@ -193,8 +194,9 @@ class Scalar {
   template <
       typename T,
-      typename std::enable_if_t<!standalone::c10::is_complex<T>::value, int> =
-          0>
+      typename std::enable_if_t<
+          !executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          int> = 0>
   bool equal(T num) const {
     if (isComplex()) {
       auto val = v.z;
@@ -223,7 +225,9 @@ class Scalar {
   template <
       typename T,
-      typename std::enable_if_t<standalone::c10::is_complex<T>::value, int> = 0>
+      typename std::enable_if_t<
+          executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          int> = 0>
   bool equal(T num) const {
     if (isComplex()) {
       return v.z == num;
@@ -257,20 +261,20 @@ class Scalar {
     }
   }
-  standalone::c10::ScalarType type() const {
+  executorch::backends::aoti::slim::c10::ScalarType type() const {
     if (isComplex()) {
-      return standalone::c10::ScalarType::ComplexDouble;
+      return executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble;
     } else if (isFloatingPoint()) {
-      return standalone::c10::ScalarType::Double;
+      return executorch::backends::aoti::slim::c10::ScalarType::Double;
     } else if (isIntegral(/*includeBool=*/false)) {
       // Represent all integers as long, UNLESS it is unsigned and therefore
       // unrepresentable as long
       if (Tag::HAS_u == tag) {
-        return standalone::c10::ScalarType::UInt64;
+        return executorch::backends::aoti::slim::c10::ScalarType::UInt64;
       }
-      return standalone::c10::ScalarType::Long;
+      return executorch::backends::aoti::slim::c10::ScalarType::Long;
     } else if (isBoolean()) {
-      return standalone::c10::ScalarType::Bool;
+      return executorch::backends::aoti::slim::c10::ScalarType::Bool;
     } else {
       throw std::runtime_error("Unknown scalar type.");
     }
@@ -313,7 +317,7 @@ class Scalar {
     int64_t i;
     // See Note [Meaning of HAS_u]
     uint64_t u;
-    standalone::c10::complex<double> z;
+    executorch::backends::aoti::slim::c10::complex<double> z;
     // NOLINTNEXTLINE(modernize-use-equals-default)
     v_t() {} // default constructor
   } v;
@@ -330,7 +334,8 @@ class Scalar {
   template <
       typename T,
       typename std::enable_if_t<
-          !std::is_integral_v<T> && !standalone::c10::is_complex<T>::value,
+          !std::is_integral_v<T> &&
+              !executorch::backends::aoti::slim::c10::is_complex<T>::value,
           bool>* = nullptr>
   Scalar(T vv, bool) : tag(Tag::HAS_d) {
     v.d = convert(vv);
  }
@@ -338,8 +343,9 @@ class Scalar {
   template <
       typename T,
-      typename std::enable_if_t<standalone::c10::is_complex<T>::value, bool>* =
-          nullptr>
+      typename std::enable_if_t<
+          executorch::backends::aoti::slim::c10::is_complex<T>::value,
+          bool>* = nullptr>
   Scalar(T vv, bool) : tag(Tag::HAS_z) {
     v.z = convert(vv);
   }
@@ -357,4 +363,4 @@ DEFINE_TO(uint32_t, UInt32)
 DEFINE_TO(uint64_t, UInt64)
 #undef DEFINE_TO
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/ScalarType.h b/backends/aoti/slim/c10/core/ScalarType.h
index 6daeaad5f2c..6481b3d2c4b 100644
--- a/backends/aoti/slim/c10/core/ScalarType.h
+++ b/backends/aoti/slim/c10/core/ScalarType.h
@@ -26,7 +26,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 // dummy struct for uint1 to uint7, actual functionality
 // of these dtypes will be implemented in python with Tensor subclass
@@ -60,53 +60,62 @@ struct dummy_int1_7_t {};
 // NB: Order matters for this macro; it is relied upon in
 // _promoteTypesLookup and the serialization format.
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \
-  _(uint8_t, Byte) /* 0 */ \
-  _(int8_t, Char) /* 1 */ \
-  _(int16_t, Short) /* 2 */ \
-  _(int, Int) /* 3 */ \
-  _(int64_t, Long) /* 4 */ \
-  _(standalone::c10::Half, Half) /* 5 */ \
-  _(float, Float) /* 6 */ \
-  _(double, Double) /* 7 */ \
-  _(standalone::c10::complex<standalone::c10::Half>, ComplexHalf) /* 8 */ \
-  _(standalone::c10::complex<float>, ComplexFloat) /* 9 */ \
-  _(standalone::c10::complex<double>, ComplexDouble) /* 10 */ \
-  _(bool, Bool) /* 11 */ \
-  _(standalone::c10::qint8, QInt8) /* 12 */ \
-  _(standalone::c10::quint8, QUInt8) /* 13 */ \
-  _(standalone::c10::qint32, QInt32) /* 14 */ \
-  _(standalone::c10::BFloat16, BFloat16) /* 15 */ \
-  _(standalone::c10::quint4x2, QUInt4x2) /* 16 */ \
-  _(standalone::c10::quint2x4, QUInt2x4) /* 17 */ \
-  _(standalone::c10::bits1x8, Bits1x8) /* 18 */ \
-  _(standalone::c10::bits2x4, Bits2x4) /* 19 */ \
-  _(standalone::c10::bits4x2, Bits4x2) /* 20 */ \
-  _(standalone::c10::bits8, Bits8) /* 21 */ \
-  _(standalone::c10::bits16, Bits16) /* 22 */ \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) /* 23 */ \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) /* 24 */ \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) /* 25 */ \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) /* 26 */ \
-  _(uint16_t, UInt16) /* 27 */ \
-  _(uint32_t, UInt32) /* 28 */ \
-  _(uint64_t, UInt64) /* 29 */ \
-  _(standalone::c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \
-  _(standalone::c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \
-  _(standalone::c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \
-  _(standalone::c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \
-  _(standalone::c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \
-  _(standalone::c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \
-  _(standalone::c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \
-  _(standalone::c10::dummy_int1_7_t<1>, Int1) /* 37 */ \
-  _(standalone::c10::dummy_int1_7_t<2>, Int2) /* 38 */ \
-  _(standalone::c10::dummy_int1_7_t<3>, Int3) /* 39 */ \
-  _(standalone::c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
-  _(standalone::c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
-  _(standalone::c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
-  _(standalone::c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */ \
-  _(standalone::c10::Float4_e2m1fn_x2, Float4_e2m1fn_x2) /* 45 */
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \
+  _(uint8_t, Byte) /* 0 */ \
+  _(int8_t, Char) /* 1 */ \
+  _(int16_t, Short) /* 2 */ \
+  _(int, Int) /* 3 */ \
+  _(int64_t, Long) /* 4 */ \
+  _(executorch::backends::aoti::slim::c10::Half, Half) /* 5 */ \
+  _(float, Float) /* 6 */ \
+  _(double, Double) /* 7 */ \
+  _(executorch::backends::aoti::slim::c10::complex< \
+        executorch::backends::aoti::slim::c10::Half>, \
+    ComplexHalf) /* 8 */ \
+  _(executorch::backends::aoti::slim::c10::complex<float>, \
+    ComplexFloat) /* 9 */ \
+  _(executorch::backends::aoti::slim::c10::complex<double>, \
+    ComplexDouble) /* 10 */ \
+  _(bool, Bool) /* 11 */ \
+  _(executorch::backends::aoti::slim::c10::qint8, QInt8) /* 12 */ \
+  _(executorch::backends::aoti::slim::c10::quint8, QUInt8) /* 13 */ \
+  _(executorch::backends::aoti::slim::c10::qint32, QInt32) /* 14 */ \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) /* 15 */ \
+  _(executorch::backends::aoti::slim::c10::quint4x2, QUInt4x2) /* 16 */ \
+  _(executorch::backends::aoti::slim::c10::quint2x4, QUInt2x4) /* 17 */ \
+  _(executorch::backends::aoti::slim::c10::bits1x8, Bits1x8) /* 18 */ \
+  _(executorch::backends::aoti::slim::c10::bits2x4, Bits2x4) /* 19 */ \
+  _(executorch::backends::aoti::slim::c10::bits4x2, Bits4x2) /* 20 */ \
+  _(executorch::backends::aoti::slim::c10::bits8, Bits8) /* 21 */ \
+  _(executorch::backends::aoti::slim::c10::bits16, Bits16) /* 22 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) /* 23 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, \
+    Float8_e4m3fn) /* 24 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, \
+    Float8_e5m2fnuz) /* 25 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, \
+    Float8_e4m3fnuz) /* 26 */ \
+  _(uint16_t, UInt16) /* 27 */ \
+  _(uint32_t, UInt32) /* 28 */ \
+  _(uint64_t, UInt64) /* 29 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<1>, Int1) /* 37 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<2>, Int2) /* 38 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<3>, Int3) /* 39 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
+  _(executorch::backends::aoti::slim::c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, \
+    Float8_e8m0fnu) /* 44 */ \
+  _(executorch::backends::aoti::slim::c10::Float4_e2m1fn_x2, \
+    Float4_e2m1fn_x2) /* 45 */
 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name). But beware: convert()
@@ -115,43 +124,45 @@ struct dummy_int1_7_t {};
 // TODO: To add unsigned int types here, we must define accumulate type.
 // But uint8 currently accumulates into int64, so we would have to make
 // an inconsistent choice for the larger types. Difficult.
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(_) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(standalone::c10::Half, Half) \
-  _(float, Float) \
-  _(double, Double) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble) \
-  _(bool, Bool) \
-  _(standalone::c10::BFloat16, BFloat16) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn)
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(_) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(executorch::backends::aoti::slim::c10::Half, Half) \
+  _(float, Float) \
+  _(double, Double) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble) \
+  _(bool, Bool) \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn)
 // This macro controls many of our C++ APIs, including constructors
 // for Scalar as well as the data() and item() accessors on Tensor
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(standalone::c10::Half, Half) \
-  _(float, Float) \
-  _(double, Double) \
-  _(standalone::c10::complex<standalone::c10::Half>, ComplexHalf) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble) \
-  _(bool, Bool) \
-  _(standalone::c10::BFloat16, BFloat16) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu)
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(executorch::backends::aoti::slim::c10::Half, Half) \
+  _(float, Float) \
+  _(double, Double) \
+  _(executorch::backends::aoti::slim::c10::complex< \
+        executorch::backends::aoti::slim::c10::Half>, \
+    ComplexHalf) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble) \
+  _(bool, Bool) \
+  _(executorch::backends::aoti::slim::c10::BFloat16, BFloat16) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, Float8_e8m0fnu)
 enum class ScalarType : int8_t {
 #define DEFINE_ST_ENUM_VAL_(_1, n) n,
@@ -168,19 +179,20 @@ namespace impl {
 // These are used to map ScalarTypes to C++ types.
-template <standalone::c10::ScalarType N>
+template <executorch::backends::aoti::slim::c10::ScalarType N>
 struct ScalarTypeToCPPType;
 #define SPECIALIZE_ScalarTypeToCPPType(cpp_type, scalar_type) \
   template <> \
-  struct ScalarTypeToCPPType<standalone::c10::ScalarType::scalar_type> { \
+  struct ScalarTypeToCPPType< \
+      executorch::backends::aoti::slim::c10::ScalarType::scalar_type> { \
     using type = cpp_type; \
 \
     /* This is a workaround for the CUDA bug which prevents */ \
     /* ::detail::ScalarTypeToCType<T>::type being used directly due to */ \
     /* ambiguous reference which can't to be resolved. For some reason it */ \
-    /* can't pick between standalone::c10::detail and \
-     * standalone::c10::cuda::detail. */ \
+    /* can't pick between executorch::backends::aoti::slim::c10::detail and \
+     * executorch::backends::aoti::slim::c10::cuda::detail. */ \
     /* For repro example, please see: */ \
     /* https://gist.github.com/izdeby/952ae7cf256ddb740a73776d39a7e7ba */ \
     /* TODO: remove once the bug is fixed. */ \
@@ -191,7 +203,7 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_ScalarTypeToCPPType)
 #undef SPECIALIZE_ScalarTypeToCPPType
-template <standalone::c10::ScalarType N>
+template <executorch::backends::aoti::slim::c10::ScalarType N>
 using ScalarTypeToCPPTypeT = typename ScalarTypeToCPPType<N>::type;
 } // namespace impl
@@ -199,12 +211,13 @@ using ScalarTypeToCPPTypeT = typename ScalarTypeToCPPType<N>::type;
 template <typename T>
 struct CppTypeToScalarType;
-#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
-  template <> \
-  struct CppTypeToScalarType<cpp_type> \
-      : std::integral_constant< \
-            standalone::c10::ScalarType, \
-            standalone::c10::ScalarType::scalar_type> {};
+#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
+  template <> \
+  struct CppTypeToScalarType<cpp_type> \
+      : std::integral_constant< \
+            executorch::backends::aoti::slim::c10::ScalarType, \
+            executorch::backends::aoti::slim::c10::ScalarType::scalar_type> { \
+  };
 AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
@@ -233,106 +246,119 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
 // instead, new types should be added to use sites on a case-by-case basis.
 // We generally are not accepting new dtypes due to binary size concerns.
-#define AT_FORALL_SCALAR_TYPES_AND(SCALARTYPE, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE>::t), \
-    SCALARTYPE)
+#define AT_FORALL_SCALAR_TYPES_AND(SCALARTYPE, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE>:: \
+        t), \
+    SCALARTYPE)
-#define AT_FORALL_SCALAR_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2)
+#define AT_FORALL_SCALAR_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2)
-#define AT_FORALL_SCALAR_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE3>::t), \
-    SCALARTYPE3)
+#define AT_FORALL_SCALAR_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE3>:: \
+        t), \
+    SCALARTYPE3)
-#define AT_FORALL_SCALAR_TYPES_AND7( \
-    SCALARTYPE1, \
-    SCALARTYPE2, \
-    SCALARTYPE3, \
-    SCALARTYPE4, \
-    SCALARTYPE5, \
-    SCALARTYPE6, \
-    SCALARTYPE7, \
-    _) \
-  _(uint8_t, Byte) \
-  _(int8_t, Char) \
-  _(int16_t, Short) \
-  _(int, Int) \
-  _(int64_t, Long) \
-  _(float, Float) \
-  _(double, Double) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE1>::t), \
-    SCALARTYPE1) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE2>::t), \
-    SCALARTYPE2) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE3>::t), \
-    SCALARTYPE3) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE4>::t), \
-    SCALARTYPE4) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE5>::t), \
-    SCALARTYPE5) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE6>::t), \
-    SCALARTYPE6) \
-  _(decltype(standalone::c10::impl::ScalarTypeToCPPType< \
-             standalone::c10::ScalarType::SCALARTYPE7>::t), \
-    SCALARTYPE7)
+#define AT_FORALL_SCALAR_TYPES_AND7( \
+    SCALARTYPE1, \
+    SCALARTYPE2, \
+    SCALARTYPE3, \
+    SCALARTYPE4, \
+    SCALARTYPE5, \
+    SCALARTYPE6, \
+    SCALARTYPE7, \
+    _) \
+  _(uint8_t, Byte) \
+  _(int8_t, Char) \
+  _(int16_t, Short) \
+  _(int, Int) \
+  _(int64_t, Long) \
+  _(float, Float) \
+  _(double, Double) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE1>:: \
+        t), \
+    SCALARTYPE1) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE2>:: \
+        t), \
+    SCALARTYPE2) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE3>:: \
+        t), \
+    SCALARTYPE3) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE4>:: \
+        t), \
+    SCALARTYPE4) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE5>:: \
+        t), \
+    SCALARTYPE5) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE6>:: \
+        t), \
+    SCALARTYPE6) \
+  _(decltype(executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPType< \
+        executorch::backends::aoti::slim::c10::ScalarType::SCALARTYPE7>:: \
+        t), \
+    SCALARTYPE7)
-#define AT_FORALL_QINT_TYPES(_) \
-  _(standalone::c10::qint8, QInt8) \
-  _(standalone::c10::quint8, QUInt8) \
-  _(standalone::c10::qint32, QInt32) \
-  _(standalone::c10::quint4x2, QUInt4x2) \
-  _(standalone::c10::quint2x4, QUInt2x4)
+#define AT_FORALL_QINT_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::qint8, QInt8) \
+  _(executorch::backends::aoti::slim::c10::quint8, QUInt8) \
+  _(executorch::backends::aoti::slim::c10::qint32, QInt32) \
+  _(executorch::backends::aoti::slim::c10::quint4x2, QUInt4x2) \
+  _(executorch::backends::aoti::slim::c10::quint2x4, QUInt2x4)
-#define AT_FORALL_FLOAT8_TYPES(_) \
-  _(standalone::c10::Float8_e5m2, Float8_e5m2) \
-  _(standalone::c10::Float8_e4m3fn, Float8_e4m3fn) \
-  _(standalone::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(standalone::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
-  _(standalone::c10::Float8_e8m0fnu, Float8_e8m0fnu)
+#define AT_FORALL_FLOAT8_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2, Float8_e5m2) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fn, Float8_e4m3fn) \
+  _(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(executorch::backends::aoti::slim::c10::Float8_e8m0fnu, Float8_e8m0fnu)
-#define AT_FORALL_COMPLEX_TYPES(_) \
-  _(standalone::c10::complex<float>, ComplexFloat) \
-  _(standalone::c10::complex<double>, ComplexDouble)
+#define AT_FORALL_COMPLEX_TYPES(_) \
+  _(executorch::backends::aoti::slim::c10::complex<float>, ComplexFloat) \
+  _(executorch::backends::aoti::slim::c10::complex<double>, ComplexDouble)
 #define DEFINE_CONSTANT(_, name) \
   constexpr ScalarType k##name = ScalarType::name;
@@ -450,10 +476,11 @@ inline ScalarType toUnderlying(ScalarType t) {
 }
 inline bool isSignedType(ScalarType t) {
-#define CASE_ISSIGNED(name) \
-  case ScalarType::name: \
-    return std::numeric_limits<::standalone::c10::impl::ScalarTypeToCPPTypeT< \
-        ScalarType::name>>::is_signed;
+#define CASE_ISSIGNED(name) \
+  case ScalarType::name: \
+    return std::numeric_limits< \
+        ::executorch::backends::aoti::slim::c10::impl::ScalarTypeToCPPTypeT< \
+            ScalarType::name>>::is_signed;
 // TODO(#146647): If we expect to have numeric_limits for everything,
 // let's just have a big macro for the whole thing.
@@ -605,20 +632,21 @@ constexpr auto b1 = ScalarType::Bool;
 constexpr auto bf = ScalarType::BFloat16;
 constexpr auto ud = ScalarType::Undefined;
-constexpr auto index2dtype = array_of<standalone::c10::ScalarType>(
-    u1,
-    i1,
-    i2,
-    i4,
-    i8,
-    f2,
-    f4,
-    f8,
-    c2,
-    c4,
-    c8,
-    b1,
-    bf);
+constexpr auto index2dtype =
+    array_of<executorch::backends::aoti::slim::c10::ScalarType>(
+        u1,
+        i1,
+        i2,
+        i4,
+        i8,
+        f2,
+        f4,
+        f8,
+        c2,
+        c4,
+        c8,
+        b1,
+        bf);
 constexpr std::array<int64_t, static_cast<size_t>(ScalarType::NumOptions)>
 calculate_dtype2index() {
@@ -728,8 +756,8 @@ inline ScalarType promoteTypes(ScalarType a, ScalarType b) {
 inline std::ostream& operator<<(
     std::ostream& stream,
-    standalone::c10::ScalarType scalar_type) {
+    executorch::backends::aoti::slim::c10::ScalarType scalar_type) {
   return stream << toString(scalar_type);
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/SizesAndStrides.h b/backends/aoti/slim/c10/core/SizesAndStrides.h
index aef0ddab171..0b9edaccde7 100644
--- a/backends/aoti/slim/c10/core/SizesAndStrides.h
+++ b/backends/aoti/slim/c10/core/SizesAndStrides.h
@@ -10,7 +10,7 @@
 #define STANDALONE_SIZES_AND_STRIDES_MAX_INLINE_SIZE 5
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 // Packed container for TensorImpl sizes and strides.
 // This design improves on the previous approach of using a pair of
@@ -399,4 +399,4 @@ class SizesAndStrides {
   };
 };
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/core/WrapDimMinimal.h b/backends/aoti/slim/c10/core/WrapDimMinimal.h
index 651421e6d89..68c80a4abc3 100644
--- a/backends/aoti/slim/c10/core/WrapDimMinimal.h
+++ b/backends/aoti/slim/c10/core/WrapDimMinimal.h
@@ -7,7 +7,7 @@
 // Different from the original implementation in c10, we don't need
 // to support SymInt here.
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
 template <typename T>
 T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar);
@@ -25,7 +25,7 @@ T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) {
     return dim;
   }
   // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors)
-  return standalone::c10::detail::maybe_wrap_dim_slow(
+  return executorch::backends::aoti::slim::c10::detail::maybe_wrap_dim_slow(
       std::move(dim), std::move(dim_post_expr), wrap_scalar);
 }
@@ -48,7 +48,7 @@ T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
       "Dimension specified as ",
       dim,
       " but tensor has no dimensions");
-  return standalone::c10::maybe_wrap_dim(
+  return executorch::backends::aoti::slim::c10::maybe_wrap_dim(
       std::move(dim),
       /*dim_post_expr=*/1,
       /*wrap_scalar=*/false);
@@ -70,4 +70,4 @@ T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
       false, "should never reach here as dim should be out-of-bounds");
 }
 } // namespace detail
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/Array.h b/backends/aoti/slim/c10/util/Array.h
index 39eabc830d1..d093d26c51a 100644
--- a/backends/aoti/slim/c10/util/Array.h
+++ b/backends/aoti/slim/c10/util/Array.h
@@ -3,7 +3,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 // This helper function creates a constexpr std::array
 // From a compile time list of values, without requiring you to explicitly
@@ -15,4 +15,4 @@ inline constexpr auto array_of(T&&... t) -> std::array<V, sizeof...(T)> {
   return {{std::forward<T>(t)...}};
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/ArrayRef.h b/backends/aoti/slim/c10/util/ArrayRef.h
index 4a09f7a9335..9c7c6cd781d 100644
--- a/backends/aoti/slim/c10/util/ArrayRef.h
+++ b/backends/aoti/slim/c10/util/ArrayRef.h
@@ -29,7 +29,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /// ArrayRef - Represent a constant reference to an array (0 or more elements
 /// consecutively in memory), i.e. a start pointer and a length. It allows
 /// various APIs to take consecutive elements easily and conveniently.
@@ -324,41 +324,49 @@ ArrayRef<T> makeArrayRef(const T (&Arr)[N]) {
 }
 // WARNING: Template instantiation will NOT be willing to do an implicit
-// conversions to get you to an standalone::c10::ArrayRef, which is why we
-// need so many overloads.
+// conversions to get you to an executorch::backends::aoti::slim::c10::ArrayRef,
+// which is why we need so many overloads.
 template <typename T>
 bool operator==(
-    standalone::c10::ArrayRef<T> a1,
-    standalone::c10::ArrayRef<T> a2) {
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
   return a1.equals(a2);
 }
 template <typename T>
 bool operator!=(
-    standalone::c10::ArrayRef<T> a1,
-    standalone::c10::ArrayRef<T> a2) {
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
   return !a1.equals(a2);
 }
 template <typename T>
-bool operator==(const std::vector<T>& a1, standalone::c10::ArrayRef<T> a2) {
-  return standalone::c10::ArrayRef<T>(a1).equals(a2);
+bool operator==(
+    const std::vector<T>& a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
+  return executorch::backends::aoti::slim::c10::ArrayRef<T>(a1).equals(a2);
 }
 template <typename T>
-bool operator!=(const std::vector<T>& a1, standalone::c10::ArrayRef<T> a2) {
-  return !standalone::c10::ArrayRef<T>(a1).equals(a2);
+bool operator!=(
+    const std::vector<T>& a1,
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a2) {
+  return !executorch::backends::aoti::slim::c10::ArrayRef<T>(a1).equals(a2);
 }
 template <typename T>
-bool operator==(standalone::c10::ArrayRef<T> a1, const std::vector<T>& a2) {
-  return a1.equals(standalone::c10::ArrayRef<T>(a2));
+bool operator==(
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    const std::vector<T>& a2) {
+  return a1.equals(executorch::backends::aoti::slim::c10::ArrayRef<T>(a2));
 }
 template <typename T>
-bool operator!=(standalone::c10::ArrayRef<T> a1, const std::vector<T>& a2) {
-  return !a1.equals(standalone::c10::ArrayRef<T>(a2));
+bool operator!=(
+    executorch::backends::aoti::slim::c10::ArrayRef<T> a1,
+    const std::vector<T>& a2) {
+  return !a1.equals(executorch::backends::aoti::slim::c10::ArrayRef<T>(a2));
 }
 using IntArrayRef = ArrayRef<int64_t>;
@@ -368,4 +376,4 @@ using IntList
     "semantics obvious. Use IntArrayRef instead!")]] = ArrayRef<int64_t>;
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/BFloat16-inl.h b/backends/aoti/slim/c10/util/BFloat16-inl.h
index 4608d9a6c54..5c41d4aaad0 100644
--- a/backends/aoti/slim/c10/util/BFloat16-inl.h
+++ b/backends/aoti/slim/c10/util/BFloat16-inl.h
@@ -16,7 +16,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #include // for SYCL 2020
 #endif
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /// Constructors
 inline STANDALONE_HOST_DEVICE BFloat16::BFloat16(float value)
@@ -26,7 +26,8 @@ inline STANDALONE_HOST_DEVICE BFloat16::BFloat16(float value)
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
-      x(standalone::c10::bit_cast<uint16_t>(sycl::ext::oneapi::bfloat16(value)))
+      x(executorch::backends::aoti::slim::c10::bit_cast<uint16_t>(
+          sycl::ext::oneapi::bfloat16(value)))
 #else
       // RNE by default
       x(detail::round_to_nearest_even(value))
@@ -289,12 +290,12 @@ inline STANDALONE_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) {
   return float(lhs) < float(rhs);
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 template <>
-class numeric_limits<standalone::c10::BFloat16> {
+class numeric_limits<executorch::backends::aoti::slim::c10::BFloat16> {
  public:
   static constexpr bool is_signed = true;
   static constexpr bool is_specialized = true;
@@ -322,41 +323,44 @@ class numeric_limits<standalone::c10::BFloat16> {
   static constexpr auto tinyness_before =
       numeric_limits<float>::tinyness_before;
-  static constexpr standalone::c10::BFloat16 min() {
-    return standalone::c10::BFloat16(
-        0x0080, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 min() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x0080, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 lowest() {
-    return standalone::c10::BFloat16(
-        0xFF7F, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 lowest() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0xFF7F, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 max() {
-    return standalone::c10::BFloat16(
-        0x7F7F, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 max() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F7F, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 epsilon() {
-    return standalone::c10::BFloat16(
-        0x3C00, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 epsilon() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x3C00, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 round_error() {
-    return standalone::c10::BFloat16(
-        0x3F00, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  round_error() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x3F00, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 infinity() {
-    return standalone::c10::BFloat16(
-        0x7F80, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 infinity() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F80, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 quiet_NaN() {
-    return standalone::c10::BFloat16(
-        0x7FC0, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16 quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7FC0, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 signaling_NaN() {
-    return standalone::c10::BFloat16(
-        0x7F80, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  signaling_NaN() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x7F80, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
-  static constexpr standalone::c10::BFloat16 denorm_min() {
-    return standalone::c10::BFloat16(
-        0x0001, standalone::c10::BFloat16::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::BFloat16
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::BFloat16(
+        0x0001, executorch::backends::aoti::slim::c10::BFloat16::from_bits());
   }
 };
diff --git a/backends/aoti/slim/c10/util/BFloat16-math.h b/backends/aoti/slim/c10/util/BFloat16-math.h
index f036f309e26..ad67d81fa23 100644
--- a/backends/aoti/slim/c10/util/BFloat16-math.h
+++ b/backends/aoti/slim/c10/util/BFloat16-math.h
@@ -8,243 +8,276 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
 #endif
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 template <typename T>
 struct is_reduced_floating_point
     : std::integral_constant<
           bool,
-          std::is_same_v<T, standalone::c10::Half> ||
-              std::is_same_v<T, standalone::c10::BFloat16>> {};
+          std::is_same_v<T, executorch::backends::aoti::slim::c10::Half> ||
+              std::is_same_v<
+                  T,
+                  executorch::backends::aoti::slim::c10::BFloat16>> {};
 template <typename T>
 constexpr bool is_reduced_floating_point_v =
     is_reduced_floating_point<T>::value;
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 #if !defined(FBCODE_CAFFE2) && !defined(STANDALONE_NODEPRECATED)
-using standalone::c10::is_reduced_floating_point;
-using standalone::c10::is_reduced_floating_point_v;
+using executorch::backends::aoti::slim::c10::is_reduced_floating_point;
+using executorch::backends::aoti::slim::c10::is_reduced_floating_point_v;
 #endif // !defined(FBCODE_CAFFE2) && !defined(STANDALONE_NODEPRECATED)
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T acos(T a) {
   return std::acos(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T asin(T a) {
   return std::asin(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T atan(T a) {
   return std::atan(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T atanh(T a) {
   return std::atanh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T erf(T a) {
   return std::erf(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T erfc(T a) {
   return std::erfc(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T exp(T a) {
   return std::exp(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T expm1(T a) {
   return std::expm1(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline bool isfinite(T a) {
   return std::isfinite(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log(T a) {
   return std::log(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log10(T a) {
   return std::log10(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log1p(T a) {
   return std::log1p(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T log2(T a) {
   return std::log2(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T ceil(T a) {
   return std::ceil(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T cos(T a) {
   return std::cos(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T floor(T a) {
   return std::floor(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T nearbyint(T a) {
   return std::nearbyint(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sin(T a) {
   return std::sin(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T tan(T a) {
   return std::tan(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sinh(T a) {
   return std::sinh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T cosh(T a) {
   return std::cosh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T tanh(T a) {
   return std::tanh(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T trunc(T a) {
   return std::trunc(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T lgamma(T a) {
   return std::lgamma(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T sqrt(T a) {
   return std::sqrt(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T rsqrt(T a) {
   return 1.0 / std::sqrt(float(a));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T abs(T a) {
   return std::abs(float(a));
 }
 #if defined(_MSC_VER) && defined(__CUDACC__)
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, double b) {
   return std::pow(float(a), float(b));
 }
 #else
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, double b) {
   return std::pow(float(a), b);
 }
 #endif
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T pow(T a, T b) {
   return std::pow(float(a), float(b));
 }
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 inline T fmod(T a, T b) {
   return std::fmod(float(a), float(b));
 }
@@ -277,8 +310,9 @@ inline T fmod(T a, T b) {
  */
 template <
     typename T,
-    typename std::
-        enable_if_t<standalone::c10::is_reduced_floating_point_v<T>, int> = 0>
+    typename std::enable_if_t<
+        executorch::backends::aoti::slim::c10::is_reduced_floating_point_v<T>,
+        int> = 0>
 STANDALONE_HOST_DEVICE inline T nextafter(T from, T to) {
   // Reference:
   // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
diff --git a/backends/aoti/slim/c10/util/BFloat16.h b/backends/aoti/slim/c10/util/BFloat16.h
index ed6d07f53d0..d1b2a5baeb2 100644
--- a/backends/aoti/slim/c10/util/BFloat16.h
+++ b/backends/aoti/slim/c10/util/BFloat16.h
@@ -20,7 +20,7 @@
 #include // for SYCL 2020
 #endif
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
 inline STANDALONE_HOST_DEVICE float f32_from_bits(uint16_t src) {
@@ -118,6 +118,6 @@ inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
   return out;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 #include // IWYU pragma: keep
diff --git a/backends/aoti/slim/c10/util/Exception.h b/backends/aoti/slim/c10/util/Exception.h
index 6ab2bd8aae6..f83bf3f074a 100644
--- a/backends/aoti/slim/c10/util/Exception.h
+++ b/backends/aoti/slim/c10/util/Exception.h
@@ -6,8 +6,8 @@
 #include
 // In the standalone version, STANDALONE_CHECK throws std::runtime_error
-// instead of standalone::c10::Error.
-namespace standalone::c10::detail {
+// instead of executorch::backends::aoti::slim::c10::Error.
+namespace executorch::backends::aoti::slim::c10::detail {
 template <typename... Args>
 std::string torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {
   // This is similar to the one in c10/util/Exception.h, but does
@@ -25,14 +25,14 @@ inline const char* torchCheckMsgImpl(const char* msg) {
 inline const char* torchCheckMsgImpl(const char* /*msg*/, const char* args) {
   return args;
 }
-} // namespace standalone::c10::detail
+} // namespace executorch::backends::aoti::slim::c10::detail
-#define STANDALONE_CHECK_MSG(cond, type, ...) \
-  (::standalone::c10::detail::torchCheckMsgImpl( \
-      "Expected " #cond \
-      " to be true, but got false. " \
-      "(Could this error message be improved? If so, " \
-      "please report an enhancement request to PyTorch.)", \
+#define STANDALONE_CHECK_MSG(cond, type, ...) \
+  (::executorch::backends::aoti::slim::c10::detail::torchCheckMsgImpl( \
+      "Expected " #cond \
+      " to be true, but got false. " \
+      "(Could this error message be improved? If so, " \
+      "please report an enhancement request to PyTorch.)", \
       ##__VA_ARGS__))
 #define STANDALONE_CHECK(cond, ...) \
   if (STANDALONE_UNLIKELY_OR_CONST(!(cond))) { \
@@ -63,8 +63,9 @@ inline const char* torchCheckMsgImpl(const char* /*msg*/, const char* args) {
           ##__VA_ARGS__)); \
   }
-#define WARNING_MESSAGE_STRING(...) \
-  ::standalone::c10::detail::torchCheckMsgImpl(__VA_ARGS__)
+#define WARNING_MESSAGE_STRING(...) \
+  ::executorch::backends::aoti::slim::c10::detail::torchCheckMsgImpl( \
+      __VA_ARGS__)
 #ifdef DISABLE_WARN
 #define _STANDALONE_WARN_WITH(...) ((void)0);
diff --git a/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h b/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
index 600e281b583..182163b9ca2 100644
--- a/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
+++ b/backends/aoti/slim/c10/util/Float4_e2m1fn_x2.h
@@ -17,7 +17,7 @@
 /// sign/exponent/mantissa | seem : seem
 ///
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 struct alignas(1) Float4_e2m1fn_x2 {
   uint8_t val_;
@@ -25,4 +25,4 @@ struct alignas(1) Float4_e2m1fn_x2 {
   STANDALONE_HOST_DEVICE explicit Float4_e2m1fn_x2(uint8_t val) : val_(val) {}
 };
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h b/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
index cc31b82e699..a0cb1db2888 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fn-inl.h
@@ -9,7 +9,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /// Constructors
@@ -229,14 +229,15 @@ operator/(int64_t a, Float8_e4m3fn b) {
 }
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e4m3fn to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e4m3fn to
+/// float.
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 template <>
-class numeric_limits<standalone::c10::Float8_e4m3fn> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e4m3fn> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = true;
@@ -262,33 +263,45 @@ class numeric_limits<standalone::c10::Float8_e4m3fn> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before = false;
-  static constexpr standalone::c10::Float8_e4m3fn min() {
-    return standalone::c10::Float8_e4m3fn(
-        0x08, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x08,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn lowest() {
-    return standalone::c10::Float8_e4m3fn(
-        0xFE, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0xFE,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn max() {
-    return standalone::c10::Float8_e4m3fn(
-        0x7E, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn max() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x7E,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn epsilon() {
-    return standalone::c10::Float8_e4m3fn(
-        0x20, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x20,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn round_error() {
-    return standalone::c10::Float8_e4m3fn(
-        0x30, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x30,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn quiet_NaN() {
-    return standalone::c10::Float8_e4m3fn(
-        0x7F, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  quiet_NaN() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x7F,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fn denorm_min() {
-    return standalone::c10::Float8_e4m3fn(
-        0x01, standalone::c10::Float8_e4m3fn::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fn
+  denorm_min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fn(
+        0x01,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fn::from_bits());
   }
 };
diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fn.h b/backends/aoti/slim/c10/util/Float8_e4m3fn.h
index 320a677cbbb..22118007289 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fn.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fn.h
@@ -32,7 +32,7 @@
 #include
 #include
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 namespace detail {
@@ -233,6 +233,6 @@ inline std::ostream& operator<<(std::ostream& out, const
     Float8_e4m3fn& value) {
   return out;
 }
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 #include // IWYU pragma: keep
diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h b/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
index 55a6ce73972..51f7c017504 100644
--- a/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
+++ b/backends/aoti/slim/c10/util/Float8_e4m3fnuz-inl.h
@@ -10,7 +10,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH()
 STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif
-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {
 /// Constructors
@@ -239,14 +239,15 @@ operator/(int64_t a, Float8_e4m3fnuz b) {
 }
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
-/// conversion from standalone::c10::Float8_e4m3fnuz to float.
+/// conversion from executorch::backends::aoti::slim::c10::Float8_e4m3fnuz to
+/// float.
-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
 namespace std {
 template <>
-class numeric_limits<standalone::c10::Float8_e4m3fnuz> {
+class numeric_limits<executorch::backends::aoti::slim::c10::Float8_e4m3fnuz> {
  public:
   static constexpr bool is_specialized = true;
   static constexpr bool is_signed = true;
@@ -272,38 +273,54 @@ class numeric_limits<standalone::c10::Float8_e4m3fnuz> {
   static constexpr auto traps = numeric_limits<float>::traps;
   static constexpr auto tinyness_before = false;
-  static constexpr standalone::c10::Float8_e4m3fnuz min() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x08, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  min() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x08,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz lowest() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0xFF, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  lowest() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0xFF,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz max() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x7F, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  max() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x7F,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz epsilon() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x28, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  epsilon() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x28,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz round_error() {
-    return standalone::c10::Float8_e4m3fnuz(
-        0x38, standalone::c10::Float8_e4m3fnuz::from_bits());
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  round_error() {
+    return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz(
+        0x38,
+        executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits());
   }
-  static constexpr standalone::c10::Float8_e4m3fnuz infinity() {
+  static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz
+  infinity() {
     // NaN (no infinities)
-    return standalone::c10::Float8_e4m3fnuz(
0x80, standalone::c10::Float8_e4m3fnuz::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz( + 0x80, + executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e4m3fnuz quiet_NaN() { - return standalone::c10::Float8_e4m3fnuz( - 0x80, standalone::c10::Float8_e4m3fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz + quiet_NaN() { + return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz( + 0x80, + executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e4m3fnuz denorm_min() { - return standalone::c10::Float8_e4m3fnuz( - 0x01, standalone::c10::Float8_e4m3fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e4m3fnuz + denorm_min() { + return executorch::backends::aoti::slim::c10::Float8_e4m3fnuz( + 0x01, + executorch::backends::aoti::slim::c10::Float8_e4m3fnuz::from_bits()); } }; diff --git a/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h b/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h index ff3c050f018..b9c8ae582f4 100644 --- a/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h +++ b/backends/aoti/slim/c10/util/Float8_e4m3fnuz.h @@ -31,7 +31,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -133,6 +133,6 @@ inline std::ostream& operator<<( return out; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 #include // IWYU pragma: keep diff --git a/backends/aoti/slim/c10/util/Float8_e5m2-inl.h b/backends/aoti/slim/c10/util/Float8_e5m2-inl.h index c8e90a8aa0d..bdc80613015 100644 --- a/backends/aoti/slim/c10/util/Float8_e5m2-inl.h +++ b/backends/aoti/slim/c10/util/Float8_e5m2-inl.h @@ -13,7 +13,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #define MAN_WIDTH_FP8 2 #define EXP_BIAS_FP8 15 -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /// Constructors @@ -229,14 +229,14 @@ inline STANDALONE_HOST_DEVICE Float8_e5m2 operator/(int64_t a, Float8_e5m2 b) { } /// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from standalone::c10::Float8_e5m2 to float. +/// conversion from executorch::backends::aoti::slim::c10::Float8_e5m2 to float. 
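[Editorial example] The NOTE above says Float8_e5m2 defines no comparison operators and instead relies on its implicit conversion to float. A minimal sketch of what that means at call sites after the rename; the `slimc10` alias is ours, not part of the patch, and it assumes the Float8_e5m2 header from this diff is included:

namespace slimc10 = executorch::backends::aoti::slim::c10;

// No operator< is declared for Float8_e5m2; both operands implicitly
// convert to float, so the built-in float comparison is what runs here.
inline bool e5m2_less(slimc10::Float8_e5m2 a, slimc10::Float8_e5m2 b) {
  return a < b;
}

A namespace alias like this is also the practical way to keep call sites readable now that the namespace is five levels deep.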
-} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_signed = true; static constexpr bool is_integer = false; @@ -263,37 +263,42 @@ class numeric_limits { static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr standalone::c10::Float8_e5m2 min() { - return standalone::c10::Float8_e5m2( - 0x4, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 min() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x4, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 max() { - return standalone::c10::Float8_e5m2( - 0x7B, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 max() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x7B, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 lowest() { - return standalone::c10::Float8_e5m2( - 0xFB, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 lowest() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0xFB, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 epsilon() { - return standalone::c10::Float8_e5m2( - 0x34, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 + epsilon() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x34, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 round_error() { - return standalone::c10::Float8_e5m2( - 0x38, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 + round_error() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x38, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 infinity() { - return standalone::c10::Float8_e5m2( - 0x7C, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 + infinity() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x7C, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 quiet_NaN() { - return standalone::c10::Float8_e5m2( - 0x7F, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 + quiet_NaN() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x7F, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } - static constexpr standalone::c10::Float8_e5m2 denorm_min() { - return standalone::c10::Float8_e5m2( - 0x01, standalone::c10::Float8_e5m2::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2 + denorm_min() { + return executorch::backends::aoti::slim::c10::Float8_e5m2( + 0x01, executorch::backends::aoti::slim::c10::Float8_e5m2::from_bits()); } }; diff --git a/backends/aoti/slim/c10/util/Float8_e5m2.h b/backends/aoti/slim/c10/util/Float8_e5m2.h index 88d1aab0525..6e9fa9b5aed 100644 --- a/backends/aoti/slim/c10/util/Float8_e5m2.h +++ 
b/backends/aoti/slim/c10/util/Float8_e5m2.h @@ -16,7 +16,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -142,6 +142,6 @@ inline std::ostream& operator<<(std::ostream& out, const Float8_e5m2& value) { return out; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 #include // IWYU pragma: keep diff --git a/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h b/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h index d2ccac329af..ca46726424b 100644 --- a/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h +++ b/backends/aoti/slim/c10/util/Float8_e5m2fnuz-inl.h @@ -10,7 +10,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH() STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /// Constructors @@ -243,14 +243,15 @@ operator/(int64_t a, Float8_e5m2fnuz b) { } /// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from standalone::c10::Float8_e5m2fnuz to float. +/// conversion from executorch::backends::aoti::slim::c10::Float8_e5m2fnuz to +/// float. -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_signed = true; static constexpr bool is_integer = false; @@ -277,39 +278,55 @@ class numeric_limits { static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr standalone::c10::Float8_e5m2fnuz min() { - return standalone::c10::Float8_e5m2fnuz( - 0x04, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + min() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x04, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e5m2fnuz max() { - return standalone::c10::Float8_e5m2fnuz( - 0x7F, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + max() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x7F, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e5m2fnuz lowest() { - return standalone::c10::Float8_e5m2fnuz( - 0xFF, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + lowest() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0xFF, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e5m2fnuz epsilon() { - return standalone::c10::Float8_e5m2fnuz( - 0x34, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + epsilon() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x34, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e5m2fnuz round_error() { - return standalone::c10::Float8_e5m2fnuz( - 0x38, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + round_error() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x38, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static 
constexpr standalone::c10::Float8_e5m2fnuz infinity() { - return standalone::c10::Float8_e5m2fnuz( - 0x80, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + infinity() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x80, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } // TODO(future): we are mapping neg_zero to both inf and NaN, this is // surprising and we should figure out what to do about it. - static constexpr standalone::c10::Float8_e5m2fnuz quiet_NaN() { - return standalone::c10::Float8_e5m2fnuz( - 0x80, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + quiet_NaN() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x80, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } - static constexpr standalone::c10::Float8_e5m2fnuz denorm_min() { - return standalone::c10::Float8_e5m2fnuz( - 0x01, standalone::c10::Float8_e5m2fnuz::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e5m2fnuz + denorm_min() { + return executorch::backends::aoti::slim::c10::Float8_e5m2fnuz( + 0x01, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz::from_bits()); } }; diff --git a/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h b/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h index c16e5613202..66c2427c8ac 100644 --- a/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h +++ b/backends/aoti/slim/c10/util/Float8_e5m2fnuz.h @@ -31,7 +31,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -133,6 +133,6 @@ inline std::ostream& operator<<( return out; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 #include // IWYU pragma: keep diff --git a/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h b/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h index f510ca551b8..4e35e04bc22 100644 --- a/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h +++ b/backends/aoti/slim/c10/util/Float8_e8m0fnu-inl.h @@ -11,7 +11,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH() STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /// Constructors @@ -25,18 +25,20 @@ inline STANDALONE_HOST_DEVICE Float8_e8m0fnu::operator float() const { // if exponent is zero, need to special case to return 2^-127 instead of zero if (x == 0) { - return standalone::c10::detail::fp32_from_bits(0x00400000); + return executorch::backends::aoti::slim::c10::detail::fp32_from_bits( + 0x00400000); } // if exponent is NaN, need to special case to return properly encoded NaN if (isnan()) { - return standalone::c10::detail::fp32_from_bits(0x7f800001); + return executorch::backends::aoti::slim::c10::detail::fp32_from_bits( + 0x7f800001); } // leave sign at 0, set the exponent bits, leave stored mantissa at 0 uint32_t res = x << 23; - return standalone::c10::detail::fp32_from_bits(res); + return executorch::backends::aoti::slim::c10::detail::fp32_from_bits(res); } /// Special values helper @@ -46,14 +48,15 @@ inline STANDALONE_HOST_DEVICE bool Float8_e8m0fnu::isnan() const { } /// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from standalone::c10::Float8_e8m0fnu to float. +/// conversion from executorch::backends::aoti::slim::c10::Float8_e8m0fnu to +/// float. 
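[Editorial example] The operator float() above fully determines the e8m0fnu decode rule: the byte is an 8-bit biased exponent (bias 127), 0x00 is special-cased to 2^-127 (an fp32 subnormal, bit pattern 0x00400000) and the NaN encoding to 0x7f800001, and every other value shifts straight into the fp32 exponent field. A self-contained sketch of the same logic, using only the standard library (std::memcpy stands in for the in-tree fp32_from_bits helper):

#include <cstdint>
#include <cstring>

inline float e8m0_decode_sketch(uint8_t x) {
  uint32_t bits;
  if (x == 0) {
    bits = 0x00400000u; // 2^-127, representable only as an fp32 subnormal
  } else if (x == 0xFF) {
    bits = 0x7f800001u; // the format's NaN; e8m0fnu has no infinities
  } else {
    bits = static_cast<uint32_t>(x) << 23; // biased exponent into the fp32 slot
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f)); // portable bit cast
  return f;
}

// e8m0_decode_sketch(127) == 1.0f, since bias 127 maps 127 to 2^0.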
-} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_specialized = true; static constexpr bool is_signed = false; @@ -79,37 +82,47 @@ class numeric_limits { static constexpr auto traps = numeric_limits::traps; static constexpr auto tinyness_before = false; - static constexpr standalone::c10::Float8_e8m0fnu min() { + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu min() { // 2^-127 - return standalone::c10::Float8_e8m0fnu( - 0b00000000, standalone::c10::Float8_e8m0fnu::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b00000000, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } - static constexpr standalone::c10::Float8_e8m0fnu lowest() { + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu + lowest() { // 2^-127 - return standalone::c10::Float8_e8m0fnu( - 0b00000000, standalone::c10::Float8_e8m0fnu::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b00000000, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } - static constexpr standalone::c10::Float8_e8m0fnu max() { + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu max() { // 254 biased, which is 127 unbiased, so 2^127 - return standalone::c10::Float8_e8m0fnu( - 0b11111110, standalone::c10::Float8_e8m0fnu::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b11111110, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } - static constexpr standalone::c10::Float8_e8m0fnu epsilon() { + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu + epsilon() { // according to https://en.cppreference.com/w/cpp/types/numeric_limits, this // is "the difference between 1.0 and the next representable value of the // given floating-point type". The next representable value is 2.0, so the // difference is 1.0 which is 2^0. 0 unbiased is 127 biased. 
- return standalone::c10::Float8_e8m0fnu( - 0b01111111, standalone::c10::Float8_e8m0fnu::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b01111111, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } - static constexpr standalone::c10::Float8_e8m0fnu round_error() { + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu + round_error() { // 0.5 in float, which is 2^-1, and -1 + 127 = 126 - return standalone::c10::Float8_e8m0fnu( - 0b01111110, standalone::c10::Float8_e8m0fnu::from_bits()); + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b01111110, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } - static constexpr standalone::c10::Float8_e8m0fnu quiet_NaN() { - return standalone::c10::Float8_e8m0fnu( - 0b11111111, standalone::c10::Float8_e8m0fnu::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Float8_e8m0fnu + quiet_NaN() { + return executorch::backends::aoti::slim::c10::Float8_e8m0fnu( + 0b11111111, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu::from_bits()); } }; diff --git a/backends/aoti/slim/c10/util/Float8_e8m0fnu.h b/backends/aoti/slim/c10/util/Float8_e8m0fnu.h index 2e2e46d627a..0f67705c510 100644 --- a/backends/aoti/slim/c10/util/Float8_e8m0fnu.h +++ b/backends/aoti/slim/c10/util/Float8_e8m0fnu.h @@ -27,7 +27,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -38,7 +38,8 @@ namespace detail { inline STANDALONE_HOST_DEVICE uint8_t fp8e8m0fnu_from_fp32_value(float f) { // TODO(#146647): maybe rewrite without control flow - uint32_t f_bits = standalone::c10::detail::fp32_to_bits(f); + uint32_t f_bits = + executorch::backends::aoti::slim::c10::detail::fp32_to_bits(f); // extract the exponent uint32_t exponent = (f_bits >> 23) & 0b11111111; @@ -114,6 +115,6 @@ inline std::ostream& operator<<( return out; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 #include // IWYU pragma: keep diff --git a/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h b/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h index 00bfa8cd8fc..49bcaad6842 100644 --- a/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h +++ b/backends/aoti/slim/c10/util/Float8_fnuz_cvt.h @@ -8,7 +8,7 @@ #include #endif -namespace standalone::c10::detail { +namespace executorch::backends::aoti::slim::c10::detail { /* * Convert a 8-bit floating-point number in either f8 E4M3FNUZ or bf8 E5M2FNUZ @@ -61,4 +61,4 @@ inline STANDALONE_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) { return fp32_from_bits(retval); } -} // namespace standalone::c10::detail +} // namespace executorch::backends::aoti::slim::c10::detail diff --git a/backends/aoti/slim/c10/util/Half-inl.h b/backends/aoti/slim/c10/util/Half-inl.h index 05fa6349f81..f7b25c0ebe0 100644 --- a/backends/aoti/slim/c10/util/Half-inl.h +++ b/backends/aoti/slim/c10/util/Half-inl.h @@ -31,7 +31,7 @@ STANDALONE_CLANG_DIAGNOSTIC_PUSH() STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { #if defined(__aarch64__) && !defined(__CUDACC__) /// Constructors @@ -46,7 +46,8 @@ inline STANDALONE_HOST_DEVICE Half::Half(float value) #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) x(__half_as_short(__float2half(value))) #elif defined(__SYCL_DEVICE_ONLY__) - x(standalone::c10::bit_cast(sycl::half(value))) + 
x(executorch::backends::aoti::slim::c10::bit_cast( + sycl::half(value))) #elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ !defined(__APPLE__) x(at::vec::float2half_scalar(value)) @@ -62,7 +63,7 @@ inline STANDALONE_HOST_DEVICE Half::operator float() const { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(*reinterpret_cast(&x)); #elif defined(__SYCL_DEVICE_ONLY__) - return float(standalone::c10::bit_cast(x)); + return float(executorch::backends::aoti::slim::c10::bit_cast(x)); #elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ !defined(__APPLE__) return at::vec::half2float_scalar(x); @@ -127,7 +128,7 @@ inline STANDALONE_HOST_DEVICE Half operator-(const Half& a) { defined(__HIP_DEVICE_COMPILE__) return __hneg(a); #elif defined(__SYCL_DEVICE_ONLY__) - return -standalone::c10::bit_cast(a); + return -executorch::backends::aoti::slim::c10::bit_cast(a); #else return -static_cast(a); #endif @@ -283,14 +284,14 @@ inline STANDALONE_HOST_DEVICE Half operator/(int64_t a, Half b) { } /// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from standalone::c10::Half to float. +/// conversion from executorch::backends::aoti::slim::c10::Half to float. -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 namespace std { template <> -class numeric_limits { +class numeric_limits { public: static constexpr bool is_specialized = true; static constexpr bool is_signed = true; @@ -317,32 +318,41 @@ class numeric_limits { static constexpr auto traps = numeric_limits::traps; static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr standalone::c10::Half min() { - return standalone::c10::Half(0x0400, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half min() { + return executorch::backends::aoti::slim::c10::Half( + 0x0400, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half lowest() { - return standalone::c10::Half(0xFBFF, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half lowest() { + return executorch::backends::aoti::slim::c10::Half( + 0xFBFF, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half max() { - return standalone::c10::Half(0x7BFF, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half max() { + return executorch::backends::aoti::slim::c10::Half( + 0x7BFF, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half epsilon() { - return standalone::c10::Half(0x1400, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half epsilon() { + return executorch::backends::aoti::slim::c10::Half( + 0x1400, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half round_error() { - return standalone::c10::Half(0x3800, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half round_error() { + return executorch::backends::aoti::slim::c10::Half( + 0x3800, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half infinity() { - return standalone::c10::Half(0x7C00, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half infinity() { + 
return executorch::backends::aoti::slim::c10::Half( + 0x7C00, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half quiet_NaN() { - return standalone::c10::Half(0x7E00, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half quiet_NaN() { + return executorch::backends::aoti::slim::c10::Half( + 0x7E00, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half signaling_NaN() { - return standalone::c10::Half(0x7D00, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half signaling_NaN() { + return executorch::backends::aoti::slim::c10::Half( + 0x7D00, executorch::backends::aoti::slim::c10::Half::from_bits()); } - static constexpr standalone::c10::Half denorm_min() { - return standalone::c10::Half(0x0001, standalone::c10::Half::from_bits()); + static constexpr executorch::backends::aoti::slim::c10::Half denorm_min() { + return executorch::backends::aoti::slim::c10::Half( + 0x0001, executorch::backends::aoti::slim::c10::Half::from_bits()); } }; diff --git a/backends/aoti/slim/c10/util/Half.h b/backends/aoti/slim/c10/util/Half.h index 86f8d8683e0..26597d23e53 100644 --- a/backends/aoti/slim/c10/util/Half.h +++ b/backends/aoti/slim/c10/util/Half.h @@ -61,7 +61,7 @@ #endif // __x86_64__ || _M_X64 || __i386 || _M_IX86 #endif // __GNUC__ || __clang__ -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { namespace detail { @@ -359,11 +359,11 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) { #if defined(__aarch64__) && !defined(__CUDACC__) inline float16_t fp16_from_bits(uint16_t h) { - return standalone::c10::bit_cast(h); + return executorch::backends::aoti::slim::c10::bit_cast(h); } inline uint16_t fp16_to_bits(float16_t f) { - return standalone::c10::bit_cast(f); + return executorch::backends::aoti::slim::c10::bit_cast(f); } // According to https://godbolt.org/z/frExdbsWG it would translate to single @@ -419,6 +419,6 @@ inline std::ostream& operator<<(std::ostream& out, const Half& value) { return out; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 #include // IWYU pragma: keep diff --git a/backends/aoti/slim/c10/util/StringUtil.h b/backends/aoti/slim/c10/util/StringUtil.h index ff7c591e734..8a696322716 100644 --- a/backends/aoti/slim/c10/util/StringUtil.h +++ b/backends/aoti/slim/c10/util/StringUtil.h @@ -3,7 +3,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { template inline std::string Join(const std::string& delimiter, const Container& v) { std::stringstream s; @@ -13,4 +13,4 @@ inline std::string Join(const std::string& delimiter, const Container& v) { } return std::move(s).str(); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/TypeCast.h b/backends/aoti/slim/c10/util/TypeCast.h index cfaaaebec95..e3d65a7ef16 100644 --- a/backends/aoti/slim/c10/util/TypeCast.h +++ b/backends/aoti/slim/c10/util/TypeCast.h @@ -20,7 +20,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { template struct needs_real { @@ -103,66 +103,76 @@ struct static_cast_with_inter_type { template <> struct static_cast_with_inter_type< - standalone::c10::complex, - 
standalone::c10::BFloat16> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::BFloat16> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::BFloat16 src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::BFloat16 src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Float8_e5m2> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Float8_e5m2> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Float8_e5m2 src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Float8_e5m2 src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Float8_e5m2fnuz> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Float8_e5m2fnuz> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Float8_e5m2fnuz src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Float8_e5m2fnuz src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Float8_e4m3fn> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Float8_e4m3fn> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Float8_e4m3fn src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Float8_e4m3fn src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Float8_e4m3fnuz> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Float8_e4m3fnuz> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Float8_e4m3fnuz src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Float8_e4m3fnuz src) { + return static_cast>( + 
executorch::backends::aoti::slim::c10::complex{src}); } }; @@ -170,40 +180,47 @@ struct static_cast_with_inter_type< // based off our apply macros? template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Float8_e8m0fnu> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Float8_e8m0fnu> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Float8_e8m0fnu src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Float8_e8m0fnu src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::Half> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::Half> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::Half src) { - return static_cast>( - standalone::c10::complex{src}); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::Half src) { + return static_cast>( + executorch::backends::aoti::slim::c10::complex{src}); } }; template <> struct static_cast_with_inter_type< - standalone::c10::complex, - standalone::c10::complex> { + executorch::backends::aoti::slim::c10::complex< + executorch::backends::aoti::slim::c10::Half>, + executorch::backends::aoti::slim::c10::complex> { STANDALONE_HOST_DEVICE - __ubsan_ignore_undefined__ static inline standalone::c10::complex< - standalone::c10::Half> - apply(standalone::c10::complex src) { - return static_cast>( - static_cast>(src)); + __ubsan_ignore_undefined__ static inline executorch::backends::aoti::slim:: + c10::complex + apply(executorch::backends::aoti::slim::c10::complex src) { + return static_cast>( + static_cast>( + src)); } }; @@ -229,7 +246,7 @@ To checked_convert(From f, const char* name) { return convert(f); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 STANDALONE_CLANG_DIAGNOSTIC_POP() diff --git a/backends/aoti/slim/c10/util/TypeSafeSignMath.h b/backends/aoti/slim/c10/util/TypeSafeSignMath.h index 276b1cee7d0..7e23f64a39e 100644 --- a/backends/aoti/slim/c10/util/TypeSafeSignMath.h +++ b/backends/aoti/slim/c10/util/TypeSafeSignMath.h @@ -13,7 +13,7 @@ STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion") STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /// Returns false since we cannot have x < 0 if x is unsigned. template @@ -33,7 +33,8 @@ inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) { /// NOTE: Will fail on an unsigned custom type /// For the most part it's possible to fix this if /// the custom type has a constexpr constructor. 
-/// However, notably, standalone::c10::Half does not :-( +/// However, notably, executorch::backends::aoti::slim::c10::Half does not +/// :-( template inline constexpr bool is_negative(const T& x) { return is_negative(x, std::is_unsigned()); @@ -55,7 +56,8 @@ inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) { /// NOTE: Will fail on an unsigned custom type /// For the most part it's possible to fix this if /// the custom type has a constexpr constructor. -/// However, notably, standalone::c10::Half does not :-( +/// However, notably, executorch::backends::aoti::slim::c10::Half does not +/// :-( template inline constexpr int signum(const T& x) { return signum(x, std::is_unsigned()); @@ -129,13 +131,14 @@ inline constexpr bool less_than_lowest( /// NOTE: Will fail on an unsigned custom type /// For the most part it's possible to fix this if /// the custom type has a constexpr constructor. -/// However, notably, standalone::c10::Half does not : +/// However, notably, executorch::backends::aoti::slim::c10::Half does not +/// : template inline constexpr bool less_than_lowest(const T& x) { return less_than_lowest( x, std::is_unsigned(), std::is_unsigned()); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 STANDALONE_CLANG_DIAGNOSTIC_POP() diff --git a/backends/aoti/slim/c10/util/accumulate.h b/backends/aoti/slim/c10/util/accumulate.h index 4972dd9826a..578c6246b29 100644 --- a/backends/aoti/slim/c10/util/accumulate.h +++ b/backends/aoti/slim/c10/util/accumulate.h @@ -11,7 +11,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /// Sum of a list of integers; accumulates into the int64_t datatype template < @@ -122,4 +122,4 @@ inline int64_t numelements_between_dim(int k, int l, const C& dims) { return multiply_integers(cbegin, cend); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/bit_cast.h b/backends/aoti/slim/c10/util/bit_cast.h index 765ec641486..5a1e1208acf 100644 --- a/backends/aoti/slim/c10/util/bit_cast.h +++ b/backends/aoti/slim/c10/util/bit_cast.h @@ -11,7 +11,7 @@ #endif // __has_include() && (__cplusplus >= 202002L || // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)) -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { #if STANDALONE_HAVE_STD_BIT_CAST using std::bit_cast; @@ -41,4 +41,4 @@ bit_cast(const From& src) noexcept { #endif // STANDALONE_HAVE_STD_BIT_CAST #undef STANDALONE_HAVE_STD_BIT_CAST -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/bits.h b/backends/aoti/slim/c10/util/bits.h index 2d365463a01..d04f88dafc8 100644 --- a/backends/aoti/slim/c10/util/bits.h +++ b/backends/aoti/slim/c10/util/bits.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte @@ -58,4 +58,4 @@ struct alignas(2) bits16 { STANDALONE_HOST_DEVICE explicit bits16(uint16_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/complex.h b/backends/aoti/slim/c10/util/complex.h index 988e446b3e4..b48ef792ed7 100644 --- a/backends/aoti/slim/c10/util/complex.h +++ b/backends/aoti/slim/c10/util/complex.h @@ -17,19 +17,19 @@ 
STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wfloat-conversion") #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { -// standalone::c10::complex is an implementation of complex numbers that aims -// to work on all devices supported by PyTorch +// executorch::backends::aoti::slim::c10::complex is an implementation of +// complex numbers that aims to work on all devices supported by PyTorch // // Most of the APIs duplicates std::complex // Reference: https://en.cppreference.com/w/cpp/numeric/complex // // [NOTE: Complex Operator Unification] // Operators currently use a mix of std::complex, thrust::complex, and -// standalone::c10::complex internally. The end state is that all operators -// will use standalone::c10::complex internally. Until then, there may be -// some hacks to support all variants. +// executorch::backends::aoti::slim::c10::complex internally. The end state is +// that all operators will use executorch::backends::aoti::slim::c10::complex +// internally. Until then, there may be some hacks to support all variants. // // // [Note on Constructors] @@ -89,9 +89,9 @@ namespace standalone::c10 { // // std::complex has custom literals `i`, `if` and `il` defined in namespace // `std::literals::complex_literals`. We define our own custom literals in the -// namespace `standalone::c10::complex_literals`. Our custom literals does not -// follow the same behavior as in std::complex, instead, we define _if, _id to -// construct float/double complex literals. +// namespace `executorch::backends::aoti::slim::c10::complex_literals`. Our +// custom literals does not follow the same behavior as in std::complex, +// instead, we define _if, _id to construct float/double complex literals. // // // [real() and imag()] @@ -138,9 +138,11 @@ namespace standalone::c10 { // // // -// TODO(@zasdfgbnm): standalone::c10::complex is not -// currently supported, because: -// - lots of members and functions of standalone::c10::Half are not constexpr +// TODO(@zasdfgbnm): +// executorch::backends::aoti::slim::c10::complex +// is not currently supported, because: +// - lots of members and functions of +// executorch::backends::aoti::slim::c10::Half are not constexpr // - thrust::complex only support float and double template @@ -166,7 +168,8 @@ struct alignas(sizeof(T) * 2) complex { #endif // Use SFINAE to specialize casting constructor for - // standalone::c10::complex and standalone::c10::complex + // executorch::backends::aoti::slim::c10::complex and + // executorch::backends::aoti::slim::c10::complex template STANDALONE_HOST_DEVICE explicit constexpr complex( const std::enable_if_t, complex>& other) @@ -430,69 +433,69 @@ constexpr complex operator/(const T& lhs, const complex& rhs) { return result /= rhs; } -// Define operators between integral scalars and standalone::c10::complex. -// std::complex does not support this when T is a floating-point number. This is -// useful because it saves a lot of "static_cast" when operate a complex and an -// integer. This makes the code both less verbose and potentially more -// efficient. +// Define operators between integral scalars and +// executorch::backends::aoti::slim::c10::complex. std::complex does not support +// this when T is a floating-point number. This is useful because it saves a lot +// of "static_cast" when operate a complex and an integer. This makes the code +// both less verbose and potentially more efficient. 
#define COMPLEX_INTEGER_OP_TEMPLATE_CONDITION \ typename std::enable_if_t< \ std::is_floating_point_v && std::is_integral_v, \ int> = 0 template -constexpr standalone::c10::complex operator+( - const standalone::c10::complex& a, +constexpr executorch::backends::aoti::slim::c10::complex operator+( + const executorch::backends::aoti::slim::c10::complex& a, const iT& b) { return a + static_cast(b); } template -constexpr standalone::c10::complex operator+( +constexpr executorch::backends::aoti::slim::c10::complex operator+( const iT& a, - const standalone::c10::complex& b) { + const executorch::backends::aoti::slim::c10::complex& b) { return static_cast(a) + b; } template -constexpr standalone::c10::complex operator-( - const standalone::c10::complex& a, +constexpr executorch::backends::aoti::slim::c10::complex operator-( + const executorch::backends::aoti::slim::c10::complex& a, const iT& b) { return a - static_cast(b); } template -constexpr standalone::c10::complex operator-( +constexpr executorch::backends::aoti::slim::c10::complex operator-( const iT& a, - const standalone::c10::complex& b) { + const executorch::backends::aoti::slim::c10::complex& b) { return static_cast(a) - b; } template -constexpr standalone::c10::complex operator*( - const standalone::c10::complex& a, +constexpr executorch::backends::aoti::slim::c10::complex operator*( + const executorch::backends::aoti::slim::c10::complex& a, const iT& b) { return a * static_cast(b); } template -constexpr standalone::c10::complex operator*( +constexpr executorch::backends::aoti::slim::c10::complex operator*( const iT& a, - const standalone::c10::complex& b) { + const executorch::backends::aoti::slim::c10::complex& b) { return static_cast(a) * b; } template -constexpr standalone::c10::complex operator/( - const standalone::c10::complex& a, +constexpr executorch::backends::aoti::slim::c10::complex operator/( + const executorch::backends::aoti::slim::c10::complex& a, const iT& b) { return a / static_cast(b); } template -constexpr standalone::c10::complex operator/( +constexpr executorch::backends::aoti::slim::c10::complex operator/( const iT& a, - const standalone::c10::complex& b) { + const executorch::backends::aoti::slim::c10::complex& b) { return static_cast(a) / b; } @@ -545,7 +548,7 @@ std::basic_istream& operator>>( return is; } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 // std functions // @@ -554,17 +557,18 @@ std::basic_istream& operator>>( namespace std { template -constexpr T real(const standalone::c10::complex& z) { +constexpr T real(const executorch::backends::aoti::slim::c10::complex& z) { return z.real(); } template -constexpr T imag(const standalone::c10::complex& z) { +constexpr T imag(const executorch::backends::aoti::slim::c10::complex& z) { return z.imag(); } template -STANDALONE_HOST_DEVICE T abs(const standalone::c10::complex& z) { +STANDALONE_HOST_DEVICE T +abs(const executorch::backends::aoti::slim::c10::complex& z) { #if defined(__CUDACC__) || defined(__HIPCC__) return thrust::abs(static_cast>(z)); #else @@ -579,14 +583,15 @@ STANDALONE_HOST_DEVICE T abs(const standalone::c10::complex& z) { #endif template -STANDALONE_HOST_DEVICE T arg(const standalone::c10::complex& z) { +STANDALONE_HOST_DEVICE T +arg(const executorch::backends::aoti::slim::c10::complex& z) { return ROCm_Bug(std)::atan2(std::imag(z), std::real(z)); } #undef ROCm_Bug template -constexpr T norm(const standalone::c10::complex& z) { +constexpr T norm(const executorch::backends::aoti::slim::c10::complex& z) 
{ return z.real() * z.real() + z.imag() * z.imag(); } @@ -596,11 +601,12 @@ constexpr T norm(const standalone::c10::complex& z) { // constexpr std::complex conj( DoubleOrInteger z ); // constexpr std::complex conj( long double z ); // These are not implemented -// TODO(@zasdfgbnm): implement them as standalone::c10::conj +// TODO(@zasdfgbnm): implement them as +// executorch::backends::aoti::slim::c10::conj template -constexpr standalone::c10::complex conj( - const standalone::c10::complex& z) { - return standalone::c10::complex(z.real(), -z.imag()); +constexpr executorch::backends::aoti::slim::c10::complex conj( + const executorch::backends::aoti::slim::c10::complex& z) { + return executorch::backends::aoti::slim::c10::complex(z.real(), -z.imag()); } // Thrust does not have complex --> complex version of thrust::proj, @@ -608,11 +614,12 @@ constexpr standalone::c10::complex conj( // TODO(@zasdfgbnm): implement it by ourselves // There is no standalone version of std::polar, because std::polar always -// returns std::complex. Use standalone::c10::polar instead; +// returns std::complex. Use executorch::backends::aoti::slim::c10::polar +// instead; } // namespace std -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { template STANDALONE_HOST_DEVICE complex polar(const T& r, const T& theta = T()) { @@ -639,12 +646,12 @@ struct alignas(4) complex { const Half& imag) : real_(real), imag_(imag) {} STANDALONE_HOST_DEVICE inline complex( - const standalone::c10::complex& value) + const executorch::backends::aoti::slim::c10::complex& value) : real_(value.real()), imag_(value.imag()) {} // Conversion operator - inline STANDALONE_HOST_DEVICE operator standalone::c10::complex() - const { + inline STANDALONE_HOST_DEVICE + operator executorch::backends::aoti::slim::c10::complex() const { return {real_, imag_}; } @@ -678,7 +685,7 @@ struct alignas(4) complex { } }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 STANDALONE_CLANG_DIAGNOSTIC_POP() diff --git a/backends/aoti/slim/c10/util/complex_math.h b/backends/aoti/slim/c10/util/complex_math.h index 56fc84fe90b..3ada9db6f00 100644 --- a/backends/aoti/slim/c10/util/complex_math.h +++ b/backends/aoti/slim/c10/util/complex_math.h @@ -5,52 +5,52 @@ #include -namespace standalone::c10::complex_math { +namespace executorch::backends::aoti::slim::c10::complex_math { // Exponential functions template -STANDALONE_HOST_DEVICE inline standalone::c10::complex exp( - const standalone::c10::complex& x) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +exp(const executorch::backends::aoti::slim::c10::complex& x) { #if defined(__CUDACC__) || defined(__HIPCC__) - return static_cast>( + return static_cast>( thrust::exp(static_cast>(x))); #else - return static_cast>( + return static_cast>( std::exp(static_cast>(x))); #endif } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex log( - const standalone::c10::complex& x) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +log(const executorch::backends::aoti::slim::c10::complex& x) { #if defined(__CUDACC__) || defined(__HIPCC__) - return static_cast>( + return static_cast>( thrust::log(static_cast>(x))); #else - return static_cast>( + return static_cast>( std::log(static_cast>(x))); #endif } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex log10( - const standalone::c10::complex& x) { +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex 
+log10(const executorch::backends::aoti::slim::c10::complex& x) { #if defined(__CUDACC__) || defined(__HIPCC__) - return static_cast>( + return static_cast>( thrust::log10(static_cast>(x))); #else - return static_cast>( + return static_cast>( std::log10(static_cast>(x))); #endif } template -STANDALONE_HOST_DEVICE inline standalone::c10::complex log2( - const standalone::c10::complex& x) { - const standalone::c10::complex log2 = - standalone::c10::complex(::log(2.0), 0.0); - return standalone::c10::complex_math::log(x) / log2; +STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex +log2(const executorch::backends::aoti::slim::c10::complex& x) { + const executorch::backends::aoti::slim::c10::complex log2 = + executorch::backends::aoti::slim::c10::complex(::log(2.0), 0.0); + return executorch::backends::aoti::slim::c10::complex_math::log(x) / log2; } // Power functions @@ -59,34 +59,36 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex log2( (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)) namespace _detail { template -standalone::c10::complex compute_csqrt( - const standalone::c10::complex& z) { +executorch::backends::aoti::slim::c10::complex compute_csqrt( + const executorch::backends::aoti::slim::c10::complex& z) { constexpr auto half = T(.5); // Trust standard library to correctly handle infs and NaNs if (std::isinf(z.real()) || std::isinf(z.imag()) || std::isnan(z.real()) || std::isnan(z.imag())) { - return static_cast>( + return static_cast>( std::sqrt(static_cast>(z))); } // Special case for square root of pure imaginary values if (z.real() == T(0)) { if (z.imag() == T(0)) { - return standalone::c10::complex(T(0), z.imag()); + return executorch::backends::aoti::slim::c10::complex(T(0), z.imag()); } auto v = std::sqrt(half * std::abs(z.imag())); - return standalone::c10::complex(v, std::copysign(v, z.imag())); + return executorch::backends::aoti::slim::c10::complex( + v, std::copysign(v, z.imag())); } // At this point, z is non-zero and finite if (z.real() >= 0.0) { auto t = std::sqrt((z.real() + std::abs(z)) * half); - return standalone::c10::complex(t, half * (z.imag() / t)); + return executorch::backends::aoti::slim::c10::complex( + t, half * (z.imag() / t)); } auto t = std::sqrt((-z.real() + std::abs(z)) * half); - return standalone::c10::complex( + return executorch::backends::aoti::slim::c10::complex( half * std::abs(z.imag() / t), std::copysign(t, z.imag())); } @@ -95,58 +97,59 @@ standalone::c10::complex compute_csqrt( // cacos(z).re = 2*atan2(sqrt(1-z).re(), sqrt(1+z).re()) // cacos(z).im = asinh((sqrt(conj(1+z))*sqrt(1-z)).im()) template -standalone::c10::complex compute_cacos( - const standalone::c10::complex& z) { +executorch::backends::aoti::slim::c10::complex compute_cacos( + const executorch::backends::aoti::slim::c10::complex& z) { auto constexpr one = T(1); // Trust standard library to correctly handle infs and NaNs if (std::isinf(z.real()) || std::isinf(z.imag()) || std::isnan(z.real()) || std::isnan(z.imag())) { - return static_cast>( + return static_cast>( std::acos(static_cast>(z))); } - auto a = - compute_csqrt(standalone::c10::complex(one - z.real(), -z.imag())); - auto b = compute_csqrt(standalone::c10::complex(one + z.real(), z.imag())); - auto c = - compute_csqrt(standalone::c10::complex(one + z.real(), -z.imag())); + auto a = compute_csqrt(executorch::backends::aoti::slim::c10::complex( + one - z.real(), -z.imag())); + auto b = compute_csqrt(executorch::backends::aoti::slim::c10::complex( + one + z.real(), 
z.imag()));
+  auto c = compute_csqrt(executorch::backends::aoti::slim::c10::complex<T>(
+      one + z.real(), -z.imag()));
   auto r = T(2) * std::atan2(a.real(), b.real());
   // Explicitly unroll (a*c).imag()
   auto i = std::asinh(a.real() * c.imag() + a.imag() * c.real());
-  return standalone::c10::complex<T>(r, i);
+  return executorch::backends::aoti::slim::c10::complex<T>(r, i);
 }

-inline standalone::c10::complex<float> sqrt(
-    const standalone::c10::complex<float>& in) {
+inline executorch::backends::aoti::slim::c10::complex<float> sqrt(
+    const executorch::backends::aoti::slim::c10::complex<float>& in) {
   return compute_csqrt(in);
 }

-inline standalone::c10::complex<double> sqrt(
-    const standalone::c10::complex<double>& in) {
+inline executorch::backends::aoti::slim::c10::complex<double> sqrt(
+    const executorch::backends::aoti::slim::c10::complex<double>& in) {
   return compute_csqrt(in);
 }

-inline standalone::c10::complex<float> acos(
-    const standalone::c10::complex<float>& in) {
+inline executorch::backends::aoti::slim::c10::complex<float> acos(
+    const executorch::backends::aoti::slim::c10::complex<float>& in) {
   return compute_cacos(in);
 }

-inline standalone::c10::complex<double> acos(
-    const standalone::c10::complex<double>& in) {
+inline executorch::backends::aoti::slim::c10::complex<double> acos(
+    const executorch::backends::aoti::slim::c10::complex<double>& in) {
   return compute_cacos(in);
 }

 } // namespace _detail
 #endif

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sqrt(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sqrt(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sqrt(static_cast<thrust::complex<T>>(x)));
 #elif !( \
     defined(_LIBCPP_VERSION) || \
     (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX)))
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sqrt(static_cast<std::complex<T>>(x)));
 #else
   return _detail::sqrt(x);
@@ -154,79 +157,84 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sqrt(
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const standalone::c10::complex<T>& x,
-    const standalone::c10::complex<T>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x,
+    const executorch::backends::aoti::slim::c10::complex<T>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(thrust::pow(
-      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<T>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
+      thrust::pow(
+          static_cast<thrust::complex<T>>(x),
+          static_cast<thrust::complex<T>>(y)));
 #else
-  return static_cast<standalone::c10::complex<T>>(std::pow(
-      static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
+      std::pow(
+          static_cast<std::complex<T>>(x), static_cast<std::complex<T>>(y)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const standalone::c10::complex<T>& x,
-    const T& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x, const T& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::pow(static_cast<thrust::complex<T>>(x), y));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::pow(static_cast<std::complex<T>>(x), y));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> pow(
-    const T& x,
-    const standalone::c10::complex<T>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+pow(const T& x, const executorch::backends::aoti::slim::c10::complex<T>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::pow(x, static_cast<thrust::complex<T>>(y)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::pow(x, static_cast<std::complex<T>>(y)));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const standalone::c10::complex<T>& x,
-    const standalone::c10::complex<U>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x,
+    const executorch::backends::aoti::slim::c10::complex<U>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(thrust::pow(
-      static_cast<thrust::complex<T>>(x), static_cast<thrust::complex<U>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
+      thrust::pow(
+          static_cast<thrust::complex<T>>(x),
+          static_cast<thrust::complex<U>>(y)));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(std::pow(
-      static_cast<std::complex<T>>(x), static_cast<std::complex<U>>(y)));
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
+      std::pow(
+          static_cast<std::complex<T>>(x), static_cast<std::complex<U>>(y)));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const standalone::c10::complex<T>& x,
-    const U& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const executorch::backends::aoti::slim::c10::complex<T>& x, const U& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
       thrust::pow(static_cast<thrust::complex<T>>(x), y));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
       std::pow(static_cast<std::complex<T>>(x), y));
 #endif
 }

 template <typename T, typename U>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(
-    const T& x,
-    const standalone::c10::complex<U>& y) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<
+    decltype(T() * U())>
+pow(const T& x, const executorch::backends::aoti::slim::c10::complex<U>& y) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
       thrust::pow(x, static_cast<thrust::complex<U>>(y)));
 #else
-  return static_cast<standalone::c10::complex<decltype(T() * U())>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<decltype(T() * U())>>(
       std::pow(x, static_cast<std::complex<U>>(y)));
 #endif
 }
@@ -234,61 +242,61 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<decltype(T() * U())> pow(

 // Trigonometric functions

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sin(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sin(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sin(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sin(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> cos(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+cos(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::cos(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::cos(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> tan(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+tan(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::tan(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::tan(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> asin(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+asin(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::asin(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::asin(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> acos(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+acos(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::acos(static_cast<thrust::complex<T>>(x)));
 #elif !defined(_LIBCPP_VERSION)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::acos(static_cast<std::complex<T>>(x)));
 #else
   return _detail::acos(x);
@@ -296,13 +304,13 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> acos(
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> atan(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+atan(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::atan(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::atan(static_cast<std::complex<T>>(x)));
 #endif
 }
@@ -310,80 +318,80 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> atan(

 // Hyperbolic functions

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> sinh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+sinh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::sinh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::sinh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> cosh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+cosh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::cosh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::cosh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> tanh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+tanh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::tanh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::tanh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> asinh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+asinh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::asinh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::asinh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> acosh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+acosh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::acosh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::acosh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> atanh(
-    const standalone::c10::complex<T>& x) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+atanh(const executorch::backends::aoti::slim::c10::complex<T>& x) {
 #if defined(__CUDACC__) || defined(__HIPCC__)
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       thrust::atanh(static_cast<thrust::complex<T>>(x)));
 #else
-  return static_cast<standalone::c10::complex<T>>(
+  return static_cast<executorch::backends::aoti::slim::c10::complex<T>>(
       std::atanh(static_cast<std::complex<T>>(x)));
 #endif
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log1p(
-    const standalone::c10::complex<T>& z) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+log1p(const executorch::backends::aoti::slim::c10::complex<T>& z) {
 #if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \
     defined(__HIPCC__)
   // For Mac, the new implementation yielded a high relative error. Falling back
@@ -420,7 +428,7 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> log1p(
 #else
   // CPU path
   // Based on https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354
-  standalone::c10::complex<T> u = z + T(1);
+  executorch::backends::aoti::slim::c10::complex<T> u = z + T(1);
   if (u == T(1)) {
     return z;
   } else {
@@ -434,8 +442,8 @@
 }

 template <typename T>
-STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> expm1(
-    const standalone::c10::complex<T>& z) {
+STANDALONE_HOST_DEVICE inline executorch::backends::aoti::slim::c10::complex<T>
+expm1(const executorch::backends::aoti::slim::c10::complex<T>& z) {
   // expm1(z) = exp(z) - 1
   // Define z = x + i * y
   // f = e ^ (x + i * y) - 1
@@ -451,50 +459,50 @@ STANDALONE_HOST_DEVICE inline standalone::c10::complex<T> expm1(
   return {er, ei};
 }

-} // namespace standalone::c10::complex_math
-
-using standalone::c10::complex_math::acos;
-using standalone::c10::complex_math::acosh;
-using standalone::c10::complex_math::asin;
-using standalone::c10::complex_math::asinh;
-using standalone::c10::complex_math::atan;
-using standalone::c10::complex_math::atanh;
-using standalone::c10::complex_math::cos;
-using standalone::c10::complex_math::cosh;
-using standalone::c10::complex_math::exp;
-using standalone::c10::complex_math::expm1;
-using standalone::c10::complex_math::log;
-using standalone::c10::complex_math::log10;
-using standalone::c10::complex_math::log1p;
-using standalone::c10::complex_math::log2;
-using standalone::c10::complex_math::pow;
-using standalone::c10::complex_math::sin;
-using standalone::c10::complex_math::sinh;
-using standalone::c10::complex_math::sqrt;
-using standalone::c10::complex_math::tan;
-using standalone::c10::complex_math::tanh;
+} // namespace executorch::backends::aoti::slim::c10::complex_math
+
+using executorch::backends::aoti::slim::c10::complex_math::acos;
+using executorch::backends::aoti::slim::c10::complex_math::acosh;
+using executorch::backends::aoti::slim::c10::complex_math::asin;
+using executorch::backends::aoti::slim::c10::complex_math::asinh;
+using executorch::backends::aoti::slim::c10::complex_math::atan;
+using executorch::backends::aoti::slim::c10::complex_math::atanh;
+using executorch::backends::aoti::slim::c10::complex_math::cos;
+using executorch::backends::aoti::slim::c10::complex_math::cosh;
+using executorch::backends::aoti::slim::c10::complex_math::exp;
+using executorch::backends::aoti::slim::c10::complex_math::expm1;
+using executorch::backends::aoti::slim::c10::complex_math::log;
+using executorch::backends::aoti::slim::c10::complex_math::log10;
+using executorch::backends::aoti::slim::c10::complex_math::log1p;
+using executorch::backends::aoti::slim::c10::complex_math::log2;
+using executorch::backends::aoti::slim::c10::complex_math::pow;
+using executorch::backends::aoti::slim::c10::complex_math::sin;
+using executorch::backends::aoti::slim::c10::complex_math::sinh;
+using executorch::backends::aoti::slim::c10::complex_math::sqrt;
+using executorch::backends::aoti::slim::c10::complex_math::tan;
+using executorch::backends::aoti::slim::c10::complex_math::tanh;

 namespace std {

-using standalone::c10::complex_math::acos;
-using standalone::c10::complex_math::acosh;
-using standalone::c10::complex_math::asin;
-using standalone::c10::complex_math::asinh;
-using standalone::c10::complex_math::atan;
-using standalone::c10::complex_math::atanh;
-using standalone::c10::complex_math::cos;
-using standalone::c10::complex_math::cosh;
-using standalone::c10::complex_math::exp;
-using standalone::c10::complex_math::expm1;
-using standalone::c10::complex_math::log;
-using standalone::c10::complex_math::log10;
-using standalone::c10::complex_math::log1p;
-using standalone::c10::complex_math::log2;
-using standalone::c10::complex_math::pow;
-using standalone::c10::complex_math::sin;
-using standalone::c10::complex_math::sinh;
-using standalone::c10::complex_math::sqrt;
-using standalone::c10::complex_math::tan;
-using standalone::c10::complex_math::tanh;
+using executorch::backends::aoti::slim::c10::complex_math::acos;
+using executorch::backends::aoti::slim::c10::complex_math::acosh;
+using executorch::backends::aoti::slim::c10::complex_math::asin;
+using executorch::backends::aoti::slim::c10::complex_math::asinh;
+using executorch::backends::aoti::slim::c10::complex_math::atan;
+using executorch::backends::aoti::slim::c10::complex_math::atanh;
+using executorch::backends::aoti::slim::c10::complex_math::cos;
+using executorch::backends::aoti::slim::c10::complex_math::cosh;
+using executorch::backends::aoti::slim::c10::complex_math::exp;
+using executorch::backends::aoti::slim::c10::complex_math::expm1;
+using executorch::backends::aoti::slim::c10::complex_math::log;
+using executorch::backends::aoti::slim::c10::complex_math::log10;
+using executorch::backends::aoti::slim::c10::complex_math::log1p;
+using executorch::backends::aoti::slim::c10::complex_math::log2;
+using executorch::backends::aoti::slim::c10::complex_math::pow;
+using executorch::backends::aoti::slim::c10::complex_math::sin;
+using executorch::backends::aoti::slim::c10::complex_math::sinh;
+using executorch::backends::aoti::slim::c10::complex_math::sqrt;
+using executorch::backends::aoti::slim::c10::complex_math::tan;
+using executorch::backends::aoti::slim::c10::complex_math::tanh;

 } // namespace std
diff --git a/backends/aoti/slim/c10/util/complex_utils.h b/backends/aoti/slim/c10/util/complex_utils.h
index 5b29406a186..af6d8203c65 100644
--- a/backends/aoti/slim/c10/util/complex_utils.h
+++ b/backends/aoti/slim/c10/util/complex_utils.h
@@ -5,7 +5,7 @@
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 template <typename T>
 struct is_complex : public std::false_type {};
@@ -14,7 +14,8 @@ template <typename T>
 struct is_complex<std::complex<T>> : public std::true_type {};

 template <typename T>
-struct is_complex<standalone::c10::complex<T>> : public std::true_type {};
+struct is_complex<executorch::backends::aoti::slim::c10::complex<T>>
+    : public std::true_type {};

 // Extract double from std::complex<double>; is identity otherwise
 // TODO: Write in more idiomatic C++17
@@ -27,19 +28,20 @@ struct scalar_value_type<std::complex<T>> {
   using type = T;
 };
 template <typename T>
-struct scalar_value_type<standalone::c10::complex<T>> {
+struct scalar_value_type<executorch::backends::aoti::slim::c10::complex<T>> {
   using type = T;
 };

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10

 namespace std {

 template <typename T>
-class numeric_limits<standalone::c10::complex<T>> : public numeric_limits<T> {};
+class numeric_limits<executorch::backends::aoti::slim::c10::complex<T>>
+    : public numeric_limits<T> {};

 template <typename T>
-bool isnan(const standalone::c10::complex<T>& v) {
+bool isnan(const executorch::backends::aoti::slim::c10::complex<T>& v) {
   return std::isnan(v.real()) || std::isnan(v.imag());
 }
diff --git a/backends/aoti/slim/c10/util/copysign.h b/backends/aoti/slim/c10/util/copysign.h
index 1012934049c..ff0b0fcc847 100644
--- a/backends/aoti/slim/c10/util/copysign.h
+++ b/backends/aoti/slim/c10/util/copysign.h
@@ -3,7 +3,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 // Note: Explicit implementation of copysign for Half and BFloat16
 // is needed to workaround g++-7/8 crash on aarch64, but also makes
@@ -23,4 +23,4 @@ inline BFloat16 copysign(BFloat16 a, BFloat16 b) {
   return BFloat16((a.x & 0x7fff) | (b.x & 0x8000), BFloat16::from_bits());
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/floating_point_utils.h b/backends/aoti/slim/c10/util/floating_point_utils.h
index 259cb93b0a5..dbe208b05b9 100644
--- a/backends/aoti/slim/c10/util/floating_point_utils.h
+++ b/backends/aoti/slim/c10/util/floating_point_utils.h
@@ -4,7 +4,7 @@
 #include
 #include

-namespace standalone::c10::detail {
+namespace executorch::backends::aoti::slim::c10::detail {

 STANDALONE_HOST_DEVICE inline float fp32_from_bits(uint32_t w) {
 #if defined(__OPENCL_VERSION__)
@@ -14,7 +14,7 @@ STANDALONE_HOST_DEVICE inline float fp32_from_bits(uint32_t w) {
 #elif defined(__INTEL_COMPILER)
   return _castu32_f32(w);
 #else
-  return standalone::c10::bit_cast<float>(w);
+  return executorch::backends::aoti::slim::c10::bit_cast<float>(w);
 #endif
 }

@@ -26,8 +26,8 @@ STANDALONE_HOST_DEVICE inline uint32_t fp32_to_bits(float f) {
 #elif defined(__INTEL_COMPILER)
   return _castf32_u32(f);
 #else
-  return standalone::c10::bit_cast<uint32_t>(f);
+  return executorch::backends::aoti::slim::c10::bit_cast<uint32_t>(f);
 #endif
 }

-} // namespace standalone::c10::detail
+} // namespace executorch::backends::aoti::slim::c10::detail
diff --git a/backends/aoti/slim/c10/util/generic_math.h b/backends/aoti/slim/c10/util/generic_math.h
index 00bb4265d9d..6cc9ec72bec 100644
--- a/backends/aoti/slim/c10/util/generic_math.h
+++ b/backends/aoti/slim/c10/util/generic_math.h
@@ -6,20 +6,23 @@
 #if defined(__CUDA_ARCH__)
 #include
-#define STANDALONE_COMPAT_COPYSIGN standalone::c10::cuda::compat::copysign
+#define STANDALONE_COMPAT_COPYSIGN \
+  executorch::backends::aoti::slim::c10::cuda::compat::copysign
 // TODO: rocm is not supported yet
 // #elif defined(__HIPCC__)
 // #include
-// #define STANDALONE_COMPAT_COPYSIGN standalone::c10::hip::compat::copysign
+// #define STANDALONE_COMPAT_COPYSIGN
+// executorch::backends::aoti::slim::c10::hip::compat::copysign
 #else
 #include
-#define STANDALONE_COMPAT_COPYSIGN standalone::c10::copysign
+#define STANDALONE_COMPAT_COPYSIGN \
+  executorch::backends::aoti::slim::c10::copysign
 #endif

 // The functions in this file should be header-only as it is used under
 // ABI-compatibility mode.

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 // NOTE: [Floor Division in Python]
 // Python's __floordiv__ operator is more complicated than just floor(a / b).
@@ -61,7 +64,7 @@ inline STANDALONE_HOST_DEVICE scalar_t div_floor_floating(
 template <typename scalar_t>
 inline STANDALONE_HOST_DEVICE scalar_t div_floor_integer(scalar_t a, scalar_t b) {
-  if (standalone::c10::signs_differ(a, b)) {
+  if (executorch::backends::aoti::slim::c10::signs_differ(a, b)) {
     // Subtracts one from the results of truncation division if the
     // divisor and dividend have different sign(bit)s and the remainder of
     // the division is nonzero
@@ -102,4 +105,4 @@ inline STANDALONE_HOST_DEVICE scalar_t div_mod(scalar_t a, scalar_t b) {
   return mod;
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/irange.h b/backends/aoti/slim/c10/util/irange.h
index 0d10f373a04..75c8b48d1ca 100644
--- a/backends/aoti/slim/c10/util/irange.h
+++ b/backends/aoti/slim/c10/util/irange.h
@@ -9,7 +9,7 @@
 #include
 #include

-namespace standalone::c10 {
+namespace executorch::backends::aoti::slim::c10 {

 namespace detail {

@@ -48,9 +48,9 @@ struct integer_iterator {
   constexpr bool operator==(const integer_iterator& other) const {
     if constexpr (one_sided) {
       // Range-for loops' end test is `begin != end`, not `begin <
-      // end`. To handle `standalone::c10::irange(n)` where n < 0 (which
-      // should be empty), we just make `begin != end` fail whenever `end` is
-      // negative.
+      // end`. To handle `executorch::backends::aoti::slim::c10::irange(n)`
+      // where n < 0 (which should be empty), we just make `begin != end` fail
+      // whenever `end` is negative.
       return is_negative(other.value) || value == other.value;
     } else {
       return value == other.value;
@@ -120,4 +120,4 @@ constexpr integer_range<Integer, true> irange(Integer end) {
   return {Integer(), end};
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/c10/util/llvmMathExtras.h b/backends/aoti/slim/c10/util/llvmMathExtras.h
index 0b4f92c44c6..a42423d009d 100644
--- a/backends/aoti/slim/c10/util/llvmMathExtras.h
+++ b/backends/aoti/slim/c10/util/llvmMathExtras.h
@@ -56,7 +56,7 @@ unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask);
 }
 #endif

-namespace standalone::c10::llvm {
+namespace executorch::backends::aoti::slim::c10::llvm {
 /// The behavior an operation has on an input of 0.
 enum ZeroBehavior {
   /// The returned value is undefined.
@@ -620,7 +620,7 @@ inline double BitsToDouble(uint64_t Bits) {
 /// This function takes a 32-bit integer and returns the bit equivalent float.
 inline float BitsToFloat(uint32_t Bits) {
   // TODO: Use std::bit_cast once C++20 becomes available.
-  return standalone::c10::bit_cast<float>(Bits);
+  return executorch::backends::aoti::slim::c10::bit_cast<float>(Bits);
 }

 /// This function takes a double and returns the bit equivalent 64-bit integer.
@@ -896,4 +896,4 @@ SaturatingMultiplyAdd(T X, T Y, T A, bool* ResultOverflowed = nullptr) {
 /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf; -} // namespace standalone::c10::llvm +} // namespace executorch::backends::aoti::slim::c10::llvm diff --git a/backends/aoti/slim/c10/util/overflows.h b/backends/aoti/slim/c10/util/overflows.h index 5f636cd1a75..df2502d7910 100644 --- a/backends/aoti/slim/c10/util/overflows.h +++ b/backends/aoti/slim/c10/util/overflows.h @@ -8,7 +8,7 @@ #include #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { // In some versions of MSVC, there will be a compiler error when building. // C4146: unary minus operator applied to unsigned type, result still unsigned // C4804: unsafe use of type 'bool' in operation @@ -50,11 +50,12 @@ overflows(From f, bool strict_unsigned = false) { // `a + 255 * b`. if (!strict_unsigned) { return greater_than_max(f) || - (standalone::c10::is_negative(f) && + (executorch::backends::aoti::slim::c10::is_negative(f) && -static_cast(f) > static_cast(limit::max())); } } - return standalone::c10::less_than_lowest(f) || greater_than_max(f); + return executorch::backends::aoti::slim::c10::less_than_lowest(f) || + greater_than_max(f); } template @@ -97,4 +98,4 @@ std::enable_if_t::value, bool> overflows( typename scalar_value_type::type, typename From::value_type>(f.imag(), strict_unsigned); } -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/qint32.h b/backends/aoti/slim/c10/util/qint32.h index 7951bfd240a..2d3f72e9a10 100644 --- a/backends/aoti/slim/c10/util/qint32.h +++ b/backends/aoti/slim/c10/util/qint32.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * qint32 is for signed 32 bit quantized Tensors @@ -15,4 +15,4 @@ struct alignas(4) qint32 { STANDALONE_HOST_DEVICE explicit qint32(int32_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/qint8.h b/backends/aoti/slim/c10/util/qint8.h index 53c1fdf465a..f08ce5bfc3f 100644 --- a/backends/aoti/slim/c10/util/qint8.h +++ b/backends/aoti/slim/c10/util/qint8.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * This is the data type for quantized Tensors. 
Right now we only have @@ -17,4 +17,4 @@ struct alignas(1) qint8 { STANDALONE_HOST_DEVICE explicit qint8(int8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint2x4.h b/backends/aoti/slim/c10/util/quint2x4.h index 009802be7f2..e80848cd9eb 100644 --- a/backends/aoti/slim/c10/util/quint2x4.h +++ b/backends/aoti/slim/c10/util/quint2x4.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint2x4 is for un-signed 2 bit quantized Tensors that are packed to byte @@ -16,4 +16,4 @@ struct alignas(1) quint2x4 { STANDALONE_HOST_DEVICE explicit quint2x4(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint4x2.h b/backends/aoti/slim/c10/util/quint4x2.h index b6812ab8fde..1c2f8350596 100644 --- a/backends/aoti/slim/c10/util/quint4x2.h +++ b/backends/aoti/slim/c10/util/quint4x2.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint4x2 is for un-signed 4 bit quantized Tensors that are packed to byte @@ -16,4 +16,4 @@ struct alignas(1) quint4x2 { STANDALONE_HOST_DEVICE explicit quint4x2(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/quint8.h b/backends/aoti/slim/c10/util/quint8.h index 4019765ca4a..e8649bc4fa8 100644 --- a/backends/aoti/slim/c10/util/quint8.h +++ b/backends/aoti/slim/c10/util/quint8.h @@ -3,7 +3,7 @@ #include -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { /** * quint8 is for unsigned 8 bit quantized Tensors @@ -15,4 +15,4 @@ struct alignas(1) quint8 { STANDALONE_HOST_DEVICE explicit quint8(uint8_t val) : val_(val) {} }; -} // namespace standalone::c10 +} // namespace executorch::backends::aoti::slim::c10 diff --git a/backends/aoti/slim/c10/util/safe_numerics.h b/backends/aoti/slim/c10/util/safe_numerics.h index 26a05c636aa..df0aa6e7c5c 100644 --- a/backends/aoti/slim/c10/util/safe_numerics.h +++ b/backends/aoti/slim/c10/util/safe_numerics.h @@ -12,7 +12,7 @@ #define STANDALONE_HAS_BUILTIN_OVERFLOW() (1) #endif -namespace standalone::c10 { +namespace executorch::backends::aoti::slim::c10 { STANDALONE_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) { @@ -40,8 +40,8 @@ mul_overflows(uint64_t a, uint64_t b, uint64_t* out) { *out = a * b; // This test isnt exact, but avoids doing integer division return ( - (standalone::c10::llvm::countLeadingZeros(a) + - standalone::c10::llvm::countLeadingZeros(b)) < 64); + (executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(a) + + executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(b)) < 64); #endif } @@ -65,7 +65,8 @@ bool safe_multiplies_u64(It first, It last, uint64_t* out) { uint64_t prod = 1; bool overflow = false; for (; first != last; ++first) { - overflow |= standalone::c10::mul_overflows(prod, *first, &prod); + overflow |= executorch::backends::aoti::slim::c10::mul_overflows( + prod, *first, &prod); } *out = prod; return overflow; @@ -78,7 +79,7 @@ bool safe_multiplies_u64(It first, It last, uint64_t* out) { prod *= x; // log2(0) isn't valid, so need to track it specially is_zero |= (x == 0); - prod_log2 += standalone::c10::llvm::Log2_64_Ceil(x); + prod_log2 += executorch::backends::aoti::slim::c10::llvm::Log2_64_Ceil(x); } *out 
   *out = prod;
   // This test isnt exact, but avoids doing integer division
@@ -91,4 +92,4 @@ bool safe_multiplies_u64(const Container& c, uint64_t* out) {
   return safe_multiplies_u64(c.begin(), c.end(), out);
 }

-} // namespace standalone::c10
+} // namespace executorch::backends::aoti::slim::c10
diff --git a/backends/aoti/slim/core/SlimTensor.h b/backends/aoti/slim/core/SlimTensor.h
index 69ac4fec65f..9021e2db922 100644
--- a/backends/aoti/slim/core/SlimTensor.h
+++ b/backends/aoti/slim/core/SlimTensor.h
@@ -18,15 +18,15 @@
 #include
 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {

 class SlimTensor {
  public:
   SlimTensor(
       Storage&& storage,
-      standalone::c10::IntArrayRef sizes,
-      standalone::c10::IntArrayRef strides,
-      standalone::c10::ScalarType dtype,
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef strides,
+      executorch::backends::aoti::slim::c10::ScalarType dtype,
       int64_t storage_offset = 0)
       : storage_(std::move(storage)),
         storage_offset_(storage_offset),
@@ -39,7 +39,7 @@ class SlimTensor {
       : storage_(Storage()),
         storage_offset_(0),
         numel_(0),
-        dtype_(standalone::c10::ScalarType::Float),
+        dtype_(executorch::backends::aoti::slim::c10::ScalarType::Float),
         is_contiguous_(true) {
     sizes_and_strides_.set_sizes({0});
     sizes_and_strides_.set_strides({1});
@@ -67,42 +67,42 @@ class SlimTensor {
   }

   size_t itemsize() const {
-    return standalone::c10::elementSize(dtype_);
+    return executorch::backends::aoti::slim::c10::elementSize(dtype_);
   }

-  standalone::c10::IntArrayRef sizes() const {
+  executorch::backends::aoti::slim::c10::IntArrayRef sizes() const {
     return sizes_and_strides_.sizes_arrayref();
   }

   int64_t size(int64_t dim) const {
-    int64_t wrapped_dim =
-        standalone::c10::maybe_wrap_dim(dim, static_cast<int64_t>(this->dim()));
+    int64_t wrapped_dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim(
+        dim, static_cast<int64_t>(this->dim()));
     return sizes_and_strides_.size_at(static_cast<size_t>(wrapped_dim));
   }

-  standalone::c10::IntArrayRef strides() const {
+  executorch::backends::aoti::slim::c10::IntArrayRef strides() const {
     return sizes_and_strides_.strides_arrayref();
   }

   int64_t stride(int64_t dim) const {
-    int64_t wrapped_dim =
-        standalone::c10::maybe_wrap_dim(dim, static_cast<int64_t>(this->dim()));
+    int64_t wrapped_dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim(
+        dim, static_cast<int64_t>(this->dim()));
     return sizes_and_strides_.stride_at(static_cast<size_t>(wrapped_dim));
   }

-  standalone::c10::ScalarType dtype() const {
+  executorch::backends::aoti::slim::c10::ScalarType dtype() const {
     return dtype_;
   }

-  const standalone::c10::Device& device() const {
+  const executorch::backends::aoti::slim::c10::Device& device() const {
     return storage_->device();
   }

-  standalone::c10::DeviceType device_type() const {
+  executorch::backends::aoti::slim::c10::DeviceType device_type() const {
     return storage_->device().type();
   }

-  standalone::c10::DeviceIndex device_index() const {
+  executorch::backends::aoti::slim::c10::DeviceIndex device_index() const {
     return storage_->device().index();
   }
@@ -149,8 +149,8 @@ class SlimTensor {
   }

   void set_sizes_and_strides(
-      standalone::c10::IntArrayRef sizes,
-      standalone::c10::IntArrayRef strides,
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef strides,
       std::optional<int64_t> storage_offset = std::nullopt) {
     const int64_t new_dim = static_cast<int64_t>(sizes.size());
     STANDALONE_CHECK(
@@ -175,7 +175,7 @@ class SlimTensor {
         if (dim == new_dim - 1) {
           new_strides[dim] = 1;
         } else {
-          overflowed |= standalone::c10::mul_overflows(
+          overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
               new_strides[dim + 1],
               std::max<int64_t>(new_sizes[dim + 1], 1),
               &new_strides[dim]);
@@ -195,20 +195,24 @@ class SlimTensor {
     refresh_contiguous();
   }

-  void set_sizes_contiguous(standalone::c10::IntArrayRef new_size) {
+  void set_sizes_contiguous(
+      executorch::backends::aoti::slim::c10::IntArrayRef new_size) {
     sizes_and_strides_.set_sizes(new_size);
     refresh_numel();
-    empty_tensor_restride(standalone::c10::MemoryFormat::Contiguous);
+    empty_tensor_restride(
+        executorch::backends::aoti::slim::c10::MemoryFormat::Contiguous);
   }

-  void empty_tensor_restride(standalone::c10::MemoryFormat memory_format);
+  void empty_tensor_restride(
+      executorch::backends::aoti::slim::c10::MemoryFormat memory_format);

   SlimTensor resize_(
-      standalone::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
       std::optional<int64_t> optional_memory_format);

   // Conversion operations
-  SlimTensor to(const standalone::c10::Device& device) const {
+  SlimTensor to(
+      const executorch::backends::aoti::slim::c10::Device& device) const {
     if (device == storage_->device()) {
       return *this;
     }
@@ -230,7 +234,7 @@ class SlimTensor {
     return to(DEFAULT_CUDA_DEVICE);
   }

-  SlimTensor to(standalone::c10::ScalarType dtype) const {
+  SlimTensor to(executorch::backends::aoti::slim::c10::ScalarType dtype) const {
     STANDALONE_CHECK(false, "TBD: to(dtype)");
   }
@@ -252,7 +256,8 @@ class SlimTensor {
     // Case 2: At least one tensor is non-contiguous, perform element-wise copy
     // that respects both source and destination strides.
-    const size_t elem_size = standalone::c10::elementSize(dtype_);
+    const size_t elem_size =
+        executorch::backends::aoti::slim::c10::elementSize(dtype_);
     char* dst_data = static_cast<char*>(this->data_ptr());
     const char* src_data = static_cast<const char*>(other.data_ptr());
@@ -372,7 +377,8 @@ class SlimTensor {
       }
     } else {
       // Handle non-contiguous tensors by respecting strides
-      const size_t elem_size = standalone::c10::elementSize(this->dtype_);
+      const size_t elem_size =
+          executorch::backends::aoti::slim::c10::elementSize(this->dtype_);
       char* base_data = static_cast<char*>(this->data_ptr());

       std::vector<int64_t> counter(this->dim(), 0);
@@ -403,41 +409,43 @@ class SlimTensor {
     };

     switch (this->dtype()) {
-      case standalone::c10::ScalarType::Double:
+      case executorch::backends::aoti::slim::c10::ScalarType::Double:
         fill_value(value.to<double>());
         break;
-      case standalone::c10::ScalarType::Float:
+      case executorch::backends::aoti::slim::c10::ScalarType::Float:
         fill_value(value.to<float>());
         break;
-      case standalone::c10::ScalarType::Half:
-        fill_value(value.to<standalone::c10::Half>());
+      case executorch::backends::aoti::slim::c10::ScalarType::Half:
+        fill_value(value.to<executorch::backends::aoti::slim::c10::Half>());
         break;
-      case standalone::c10::ScalarType::BFloat16:
-        fill_value(value.to<standalone::c10::BFloat16>());
+      case executorch::backends::aoti::slim::c10::ScalarType::BFloat16:
+        fill_value(value.to<executorch::backends::aoti::slim::c10::BFloat16>());
         break;
-      case standalone::c10::ScalarType::Long:
+      case executorch::backends::aoti::slim::c10::ScalarType::Long:
         fill_value(value.to<int64_t>());
         break;
-      case standalone::c10::ScalarType::Int:
+      case executorch::backends::aoti::slim::c10::ScalarType::Int:
         fill_value(value.to<int32_t>());
         break;
-      case standalone::c10::ScalarType::Short:
+      case executorch::backends::aoti::slim::c10::ScalarType::Short:
         fill_value(value.to<int16_t>());
         break;
-      case standalone::c10::ScalarType::Char:
+      case executorch::backends::aoti::slim::c10::ScalarType::Char:
         fill_value(value.to<int8_t>());
         break;
-      case standalone::c10::ScalarType::Byte:
+      case executorch::backends::aoti::slim::c10::ScalarType::Byte:
         fill_value(value.to<uint8_t>());
         break;
-      case standalone::c10::ScalarType::Bool:
+      case executorch::backends::aoti::slim::c10::ScalarType::Bool:
         fill_value(value.to<bool>());
         break;
-      case standalone::c10::ScalarType::ComplexFloat:
-        fill_value(value.to<standalone::c10::complex<float>>());
+      case executorch::backends::aoti::slim::c10::ScalarType::ComplexFloat:
+        fill_value(
+            value.to<executorch::backends::aoti::slim::c10::complex<float>>());
         break;
-      case standalone::c10::ScalarType::ComplexDouble:
-        fill_value(value.to<standalone::c10::complex<double>>());
+      case executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble:
+        fill_value(
+            value.to<executorch::backends::aoti::slim::c10::complex<double>>());
         break;
       default:
         STANDALONE_CHECK(false, "fill_: Unsupported dtype");
@@ -452,34 +460,38 @@ class SlimTensor {

   SlimTensor clone_contiguous() const {
     std::vector<int64_t> contig_strides =
-        standalone::slim::compute_contiguous_strides(this->sizes());
+        executorch::backends::aoti::slim::compute_contiguous_strides(
+            this->sizes());
     return _clone_impl(
         this->sizes(), contig_strides, this->dtype(), this->device());
   }

   // View operations
   SlimTensor as_strided(
-      standalone::c10::IntArrayRef sizes,
-      standalone::c10::IntArrayRef strides,
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef strides,
       int64_t storage_offset) const;

   SlimTensor as_strided_(
-      standalone::c10::IntArrayRef sizes,
-      standalone::c10::IntArrayRef strides,
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef strides,
       int64_t storage_offset);

-  SlimTensor permute(standalone::c10::IntArrayRef dims) const;
+  SlimTensor permute(
+      executorch::backends::aoti::slim::c10::IntArrayRef dims) const;

   // Transpose operations
   SlimTensor transpose() const;
   SlimTensor transpose(int64_t dim0, int64_t dim1) const;
   SlimTensor t() const;

-  SlimTensor reshape(standalone::c10::IntArrayRef proposed_shape) const;
+  SlimTensor reshape(
+      executorch::backends::aoti::slim::c10::IntArrayRef proposed_shape) const;

   SlimTensor narrow(int64_t dim, int64_t start, int64_t length) const;

   // Generic element access returning SlimTensor
-  SlimTensor operator[](standalone::c10::IntArrayRef indices) const {
+  SlimTensor operator[](
+      executorch::backends::aoti::slim::c10::IntArrayRef indices) const {
     STANDALONE_CHECK(
         indices.size() <= this->dim(),
         "Number of indices (",
@@ -494,7 +506,7 @@ class SlimTensor {
     for (size_t i = 0; i < indices.size(); ++i) {
       int64_t idx = indices[i];
       int64_t size = this->size(i);
-      idx = standalone::c10::maybe_wrap_dim(idx, size);
+      idx = executorch::backends::aoti::slim::c10::maybe_wrap_dim(idx, size);
       linear_index += idx * this->stride(i);
     }
     // Create 0-dimensional tensor pointing to the indexed element
@@ -511,7 +523,7 @@ class SlimTensor {
     for (size_t i = 0; i < indices.size(); ++i) {
       int64_t idx = indices[i];
       int64_t size = this->size(i);
-      idx = standalone::c10::maybe_wrap_dim(idx, size);
+      idx = executorch::backends::aoti::slim::c10::maybe_wrap_dim(idx, size);
       offset_adjustment += idx * this->stride(i);
     }
@@ -533,41 +545,43 @@ class SlimTensor {

   // Convenience overload for single index
   SlimTensor operator[](int64_t index) const {
-    return (*this)[standalone::c10::IntArrayRef{index}];
+    return (*this)[executorch::backends::aoti::slim::c10::IntArrayRef{index}];
   }

   // Convenience overloads for common multi-dimensional cases
   SlimTensor operator[](std::initializer_list<int64_t> indices) const {
-    return (*this)[standalone::c10::IntArrayRef(indices)];
+    return (*this)[executorch::backends::aoti::slim::c10::IntArrayRef(indices)];
   }

   // Extract scalar value from 0-dimensional tensor
-  standalone::c10::Scalar item() const {
+  executorch::backends::aoti::slim::c10::Scalar item() const {
     switch (this->dtype()) {
-      case standalone::c10::ScalarType::Double:
+      case executorch::backends::aoti::slim::c10::ScalarType::Double:
        return this->item<double>();
-      case standalone::c10::ScalarType::Float:
+      case executorch::backends::aoti::slim::c10::ScalarType::Float:
        return this->item<float>();
-      case standalone::c10::ScalarType::Half:
-        return this->item<standalone::c10::Half>();
-      case standalone::c10::ScalarType::BFloat16:
-        return this->item<standalone::c10::BFloat16>();
-      case standalone::c10::ScalarType::Long:
+      case executorch::backends::aoti::slim::c10::ScalarType::Half:
+        return this->item<executorch::backends::aoti::slim::c10::Half>();
+      case executorch::backends::aoti::slim::c10::ScalarType::BFloat16:
+        return this->item<executorch::backends::aoti::slim::c10::BFloat16>();
+      case executorch::backends::aoti::slim::c10::ScalarType::Long:
        return this->item<int64_t>();
-      case standalone::c10::ScalarType::Int:
+      case executorch::backends::aoti::slim::c10::ScalarType::Int:
        return this->item<int32_t>();
-      case standalone::c10::ScalarType::Short:
+      case executorch::backends::aoti::slim::c10::ScalarType::Short:
        return this->item<int16_t>();
-      case standalone::c10::ScalarType::Char:
+      case executorch::backends::aoti::slim::c10::ScalarType::Char:
        return this->item<int8_t>();
-      case standalone::c10::ScalarType::Byte:
+      case executorch::backends::aoti::slim::c10::ScalarType::Byte:
        return this->item<uint8_t>();
-      case standalone::c10::ScalarType::Bool:
+      case executorch::backends::aoti::slim::c10::ScalarType::Bool:
        return this->item<bool>();
-      case standalone::c10::ScalarType::ComplexFloat:
-        return this->item<standalone::c10::complex<float>>();
-      case standalone::c10::ScalarType::ComplexDouble:
-        return this->item<standalone::c10::complex<double>>();
+      case executorch::backends::aoti::slim::c10::ScalarType::ComplexFloat:
+        return this
+            ->item<executorch::backends::aoti::slim::c10::complex<float>>();
+      case executorch::backends::aoti::slim::c10::ScalarType::ComplexDouble:
+        return this
+            ->item<executorch::backends::aoti::slim::c10::complex<double>>();
       default:
         STANDALONE_CHECK(false, "item(): Unsupported dtype");
     }
@@ -589,10 +603,10 @@ class SlimTensor {

  private:
   SlimTensor _clone_impl(
-      standalone::c10::IntArrayRef sizes,
-      standalone::c10::IntArrayRef strides,
-      standalone::c10::ScalarType dtype,
-      const standalone::c10::Device& device) const {
+      executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+      executorch::backends::aoti::slim::c10::IntArrayRef strides,
+      executorch::backends::aoti::slim::c10::ScalarType dtype,
+      const executorch::backends::aoti::slim::c10::Device& device) const {
     Storage storage = new_storage(sizes, strides, dtype, device);
     SlimTensor result =
         SlimTensor(std::move(storage), sizes, strides, dtype, 0);
@@ -605,7 +619,7 @@ class SlimTensor {
   }

   bool compute_is_contiguous() const {
-    return standalone::c10::_compute_contiguous(
+    return executorch::backends::aoti::slim::c10::_compute_contiguous(
         sizes_and_strides_.sizes_arrayref(),
         sizes_and_strides_.strides_arrayref(),
         numel_);
@@ -619,19 +633,27 @@ class SlimTensor {
   Storage storage_; // device_type_ and device_index_ are stored in storage_
   int64_t storage_offset_{0};
-  standalone::c10::SizesAndStrides sizes_and_strides_;
+  executorch::backends::aoti::slim::c10::SizesAndStrides sizes_and_strides_;
   // If sizes and strides are empty, the numel is 1!! However, most of the
   // time, we will immediately set sizes to {0} and reset numel to 0.
   // (Can't do that in the default initializers, because there's no way to
   // spell "allocate a one-element array" for strides_).
size_t numel_{1}; - standalone::c10::ScalarType dtype_; + executorch::backends::aoti::slim::c10::ScalarType dtype_; bool is_contiguous_{true}; // NOLINTNEXTLINE(clang-diagnostic-unused-private-field) std::array reserved_{0}; // padding to align to 8 bytes }; -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::backends::aoti::slim::SlimTensor; +} // namespace executor +} // namespace torch #include #include diff --git a/backends/aoti/slim/core/SlimTensorResize-incl.h b/backends/aoti/slim/core/SlimTensorResize-incl.h index 64c976aa5d8..bfc7b149d5e 100644 --- a/backends/aoti/slim/core/SlimTensorResize-incl.h +++ b/backends/aoti/slim/core/SlimTensorResize-incl.h @@ -6,9 +6,9 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { inline void SlimTensor::empty_tensor_restride( - standalone::c10::MemoryFormat memory_format) { + executorch::backends::aoti::slim::c10::MemoryFormat memory_format) { #ifdef DEBUG STANDALONE_INTERNAL_ASSERT( compute_numel() == numel_, @@ -16,7 +16,7 @@ inline void SlimTensor::empty_tensor_restride( "called before setting correct numel"); #endif switch (memory_format) { - case standalone::c10::MemoryFormat::Contiguous: { + case executorch::backends::aoti::slim::c10::MemoryFormat::Contiguous: { // dim_ is a virtual call, don't repeat it const auto dim_ = dim(); sizes_and_strides_.resize(dim_); @@ -25,7 +25,7 @@ inline void SlimTensor::empty_tensor_restride( const auto last_idx = dim_ - 1; sizes_and_strides_.stride_at_unchecked(last_idx) = 1; for (int64_t i = static_cast(last_idx) - 1; i >= 0; --i) { - overflowed |= standalone::c10::mul_overflows( + overflowed |= executorch::backends::aoti::slim::c10::mul_overflows( sizes_and_strides_.stride_at_unchecked(i + 1), std::max(sizes_and_strides_.size_at_unchecked(i + 1), 1), std::addressof(sizes_and_strides_.stride_at_unchecked(i))); @@ -34,24 +34,24 @@ inline void SlimTensor::empty_tensor_restride( } break; } - case standalone::c10::MemoryFormat::ChannelsLast: { + case executorch::backends::aoti::slim::c10::MemoryFormat::ChannelsLast: { STANDALONE_CHECK( dim() == 4, "required rank 4 tensor to use channels_last format"); set_sizes_and_strides(sizes(), get_channels_last_strides_2d(sizes())); break; } - case standalone::c10::MemoryFormat::ChannelsLast3d: { + case executorch::backends::aoti::slim::c10::MemoryFormat::ChannelsLast3d: { STANDALONE_CHECK( dim() == 5, "required rank 5 tensor to use channels_last_3d format"); set_sizes_and_strides(sizes(), get_channels_last_strides_3d(sizes())); break; } - case standalone::c10::MemoryFormat::Preserve: + case executorch::backends::aoti::slim::c10::MemoryFormat::Preserve: STANDALONE_CHECK(false, "unsupported memory format ", memory_format); // Cleaning warning messages, no need to break as STANDALONE_CHECK(false) // terminates flow. 
// break; - case standalone::c10::MemoryFormat::NumOptions: + case executorch::backends::aoti::slim::c10::MemoryFormat::NumOptions: STANDALONE_INTERNAL_ASSERT( false, "invalid memory format ", memory_format); } @@ -125,8 +125,8 @@ inline void _maybe_resize_storage(SlimTensor* self, int64_t new_size_bytes) { inline SlimTensor* _resize_impl_( SlimTensor* self, - standalone::c10::IntArrayRef sizes, - std::optional strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + std::optional strides, bool resize_storage) { if (self->sizes() == sizes && (!strides || self->strides() == strides.value())) { @@ -154,16 +154,17 @@ inline SlimTensor* _resize_impl_( } inline SlimTensor SlimTensor::resize_( - standalone::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, std::optional optional_memory_format) { _resize_impl_(this, sizes, /*strides=*/std::nullopt, true); if (optional_memory_format.has_value()) { - standalone::c10::MemoryFormat memory_format = - static_cast( + executorch::backends::aoti::slim::c10::MemoryFormat memory_format = + static_cast( optional_memory_format.value()); STANDALONE_CHECK( - memory_format != standalone::c10::MemoryFormat::Preserve, + memory_format != + executorch::backends::aoti::slim::c10::MemoryFormat::Preserve, "Unsupported memory format", memory_format); this->empty_tensor_restride(memory_format); @@ -171,4 +172,4 @@ inline SlimTensor SlimTensor::resize_( return *this; } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/SlimTensorView-incl.h b/backends/aoti/slim/core/SlimTensorView-incl.h index 0df4c4705f1..c247047900c 100644 --- a/backends/aoti/slim/core/SlimTensorView-incl.h +++ b/backends/aoti/slim/core/SlimTensorView-incl.h @@ -6,10 +6,10 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { inline SlimTensor SlimTensor::as_strided( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset) const { SlimTensor result = *this; result.as_strided_(sizes, strides, storage_offset); @@ -17,8 +17,8 @@ inline SlimTensor SlimTensor::as_strided( } inline SlimTensor SlimTensor::as_strided_( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, int64_t storage_offset) { STANDALONE_CHECK( sizes.size() == strides.size(), @@ -44,20 +44,22 @@ inline SlimTensor SlimTensor::as_strided_( return *this; } -inline SlimTensor SlimTensor::permute(standalone::c10::IntArrayRef dims) const { +inline SlimTensor SlimTensor::permute( + executorch::backends::aoti::slim::c10::IntArrayRef dims) const { const size_t ndim = this->dim(); STANDALONE_CHECK( ndim == static_cast(dims.size()), "permute: dims length must be equal to tensor.dim()") - standalone::c10::ArrayRef old_sizes = this->sizes(); - standalone::c10::ArrayRef old_strides = this->strides(); + executorch::backends::aoti::slim::c10::ArrayRef old_sizes = this->sizes(); + executorch::backends::aoti::slim::c10::ArrayRef old_strides = this->strides(); std::vector new_sizes = old_sizes.vec(); std::vector new_strides = old_strides.vec(); std::vector seen_dims(ndim, false); for (size_t i = 0; i < ndim; i++) { - int64_t d = standalone::c10::maybe_wrap_dim(dims[i], ndim); + int64_t d = + 
executorch::backends::aoti::slim::c10::maybe_wrap_dim(dims[i], ndim); STANDALONE_CHECK(!seen_dims[d], "permute: duplicate dims are not allowed"); seen_dims[d] = true; new_sizes[i] = old_sizes[d]; @@ -82,8 +84,8 @@ inline SlimTensor SlimTensor::transpose(int64_t dim0, int64_t dim1) const { } // Wrap dimensions and swap them - dim0 = standalone::c10::maybe_wrap_dim(dim0, ndim); - dim1 = standalone::c10::maybe_wrap_dim(dim1, ndim); + dim0 = executorch::backends::aoti::slim::c10::maybe_wrap_dim(dim0, ndim); + dim1 = executorch::backends::aoti::slim::c10::maybe_wrap_dim(dim1, ndim); std::swap(dims[dim0], dims[dim1]); return permute(dims); @@ -94,7 +96,7 @@ inline SlimTensor SlimTensor::t() const { } inline SlimTensor SlimTensor::reshape( - standalone::c10::IntArrayRef proposed_shape) const { + executorch::backends::aoti::slim::c10::IntArrayRef proposed_shape) const { std::vector final_shape_vec = infer_size(proposed_shape, this->numel()); @@ -124,8 +126,9 @@ inline SlimTensor SlimTensor::narrow(int64_t dim, int64_t start, int64_t length) const { STANDALONE_CHECK( this->dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); - dim = standalone::c10::maybe_wrap_dim(dim, static_cast(this->dim())); - start = standalone::c10::maybe_wrap_dim( + dim = executorch::backends::aoti::slim::c10::maybe_wrap_dim( + dim, static_cast(this->dim())); + start = executorch::backends::aoti::slim::c10::maybe_wrap_dim( start, static_cast(this->size(dim))); STANDALONE_CHECK(length >= 0, "narrow(): length must be non-negative."); @@ -149,4 +152,4 @@ inline SlimTensor SlimTensor::narrow(int64_t dim, int64_t start, int64_t length) return result; } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/core/Storage.h b/backends/aoti/slim/core/Storage.h index 4230a0d2b0a..135b44bca23 100644 --- a/backends/aoti/slim/core/Storage.h +++ b/backends/aoti/slim/core/Storage.h @@ -16,29 +16,35 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { using DeleterFn = void (*)(void*); namespace detail { inline void noop(void*) {} } // namespace detail -const standalone::c10::Device CPU_DEVICE = - standalone::c10::Device(standalone::c10::DeviceType::CPU, 0); +const executorch::backends::aoti::slim::c10::Device CPU_DEVICE = + executorch::backends::aoti::slim::c10::Device( + executorch::backends::aoti::slim::c10::DeviceType::CPU, + 0); -const standalone::c10::Device DEFAULT_CUDA_DEVICE = - standalone::c10::Device(standalone::c10::DeviceType::CUDA, 0); +const executorch::backends::aoti::slim::c10::Device DEFAULT_CUDA_DEVICE = + executorch::backends::aoti::slim::c10::Device( + executorch::backends::aoti::slim::c10::DeviceType::CUDA, + 0); -// standalone::c10::Device traits template for device-specific operations -template +// executorch::backends::aoti::slim::c10::Device traits template for +// device-specific operations +template struct DeviceTraits; // CPU specialization template <> -struct DeviceTraits { +struct DeviceTraits { static void* allocate( size_t nbytes, - const standalone::c10::Device& device = CPU_DEVICE) { + const executorch::backends::aoti::slim::c10::Device& device = + CPU_DEVICE) { // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) return malloc(nbytes); } @@ -52,8 +58,8 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const 
executorch::backends::aoti::slim::c10::Device& src_device) { std::memcpy(dst, src, nbytes); } }; @@ -61,9 +67,11 @@ struct DeviceTraits { // CUDA specialization #ifdef USE_CUDA template <> -struct DeviceTraits { - static void* allocate(size_t nbytes, const standalone::c10::Device& device) { - standalone::slim::cuda::CUDAGuard guard(device); +struct DeviceTraits { + static void* allocate( + size_t nbytes, + const executorch::backends::aoti::slim::c10::Device& device) { + executorch::backends::aoti::slim::cuda::CUDAGuard guard(device); void* data = nullptr; STANDALONE_CUDA_CHECK(cudaMalloc(&data, nbytes)); return data; @@ -77,11 +85,11 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const executorch::backends::aoti::slim::c10::Device& src_device) { // Determine the direction cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - standalone::c10::Device cuda_device = + executorch::backends::aoti::slim::c10::Device cuda_device = dst_device; // Default to destination device if (src_device.is_cpu()) { @@ -98,14 +106,16 @@ struct DeviceTraits { dst_device.index()); } // Set up CUDA context for the appropriate device - standalone::slim::cuda::CUDAGuard guard(cuda_device); + executorch::backends::aoti::slim::cuda::CUDAGuard guard(cuda_device); STANDALONE_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction)); } }; #else template <> -struct DeviceTraits { - static void* allocate(size_t nbytes, const standalone::c10::Device& device) { +struct DeviceTraits { + static void* allocate( + size_t nbytes, + const executorch::backends::aoti::slim::c10::Device& device) { STANDALONE_CHECK(false, "Build with USE_CUDA=1 to enable CUDA support"); } @@ -117,8 +127,8 @@ struct DeviceTraits { void* dst, const void* src, size_t nbytes, - const standalone::c10::Device& dst_device, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& dst_device, + const executorch::backends::aoti::slim::c10::Device& src_device) { STANDALONE_CHECK(false, "Build with USE_CUDA=1 to enable CUDA support"); } }; @@ -129,24 +139,29 @@ struct DeviceTraits { // non-owning. class MaybeOwningStorage { public: - MaybeOwningStorage(const standalone::c10::Device& device, size_t nbytes) + MaybeOwningStorage( + const executorch::backends::aoti::slim::c10::Device& device, + size_t nbytes) : device_(device), capacity_(nbytes), is_owning_(true) { // Allocating memory here so owning_ has to be true. 
if (device.is_cpu()) { - data_ = DeviceTraits::allocate( - nbytes, device); - deleter_ = DeviceTraits::free; + data_ = + DeviceTraits:: + allocate(nbytes, device); + deleter_ = DeviceTraits< + executorch::backends::aoti::slim::c10::DeviceType::CPU>::free; } else if (device.is_cuda()) { - data_ = DeviceTraits::allocate( - nbytes, device); - deleter_ = DeviceTraits::free; + data_ = DeviceTraits::allocate(nbytes, device); + deleter_ = DeviceTraits< + executorch::backends::aoti::slim::c10::DeviceType::CUDA>::free; } else { STANDALONE_CHECK(false, "Unsupported device type"); } } MaybeOwningStorage( - const standalone::c10::Device& device, + const executorch::backends::aoti::slim::c10::Device& device, void* data, size_t nbytes) : device_(device), data_(data), capacity_(nbytes), is_owning_(false) { @@ -201,7 +216,7 @@ class MaybeOwningStorage { void* dst_data_ptr, void* src_data_ptr, size_t nbytes, - const standalone::c10::Device& src_device) { + const executorch::backends::aoti::slim::c10::Device& src_device) { STANDALONE_CHECK( dst_data_ptr, "Storage clone failed: dst_data_ptr can not be nullptr") STANDALONE_CHECK( @@ -221,7 +236,8 @@ class MaybeOwningStorage { } } - MaybeOwningStorage clone(const standalone::c10::Device& device) const { + MaybeOwningStorage clone( + const executorch::backends::aoti::slim::c10::Device& device) const { STANDALONE_CHECK( data_, "Storage clone failed: source data can not be nullptr") // Create a new owning storage with the specified device and same capacity @@ -230,12 +246,12 @@ class MaybeOwningStorage { // Copy the data from the current storage to the new storage if (device_.is_cpu() && device.is_cpu()) { // CPU to CPU copy - DeviceTraits::memcpy( - cloned_storage.data_, data_, capacity_, device, device_); + DeviceTraits:: + memcpy(cloned_storage.data_, data_, capacity_, device, device_); } else { // At least one of the devices is CUDA - DeviceTraits::memcpy( - cloned_storage.data_, data_, capacity_, device, device_); + DeviceTraits:: + memcpy(cloned_storage.data_, data_, capacity_, device, device_); } return cloned_storage; @@ -249,7 +265,7 @@ class MaybeOwningStorage { return data_; } - const standalone::c10::Device& device() const { + const executorch::backends::aoti::slim::c10::Device& device() const { return device_; } @@ -286,7 +302,7 @@ class MaybeOwningStorage { } private: - standalone::c10::Device device_ = CPU_DEVICE; + executorch::backends::aoti::slim::c10::Device device_ = CPU_DEVICE; void* data_ = nullptr; size_t capacity_ = 0; DeleterFn deleter_ = detail::noop; @@ -296,12 +312,15 @@ class MaybeOwningStorage { using Storage = SharedPtr; inline Storage new_storage( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device = CPU_DEVICE) { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) { size_t nbytes = compute_storage_nbytes( - sizes, strides, standalone::c10::elementSize(dtype), 0); + sizes, + strides, + executorch::backends::aoti::slim::c10::elementSize(dtype), + 0); return Storage(new MaybeOwningStorage(device, nbytes)); } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/cuda/Guard.h b/backends/aoti/slim/cuda/Guard.h index c9b2441b148..2fcafce92f9 100644 --- 
a/backends/aoti/slim/cuda/Guard.h +++ b/backends/aoti/slim/cuda/Guard.h @@ -14,19 +14,20 @@ #include #include -namespace standalone::slim::cuda { +namespace executorch::backends::aoti::slim::cuda { // Thread-local stream management namespace detail { -inline thread_local std:: - unordered_map - current_streams_; +inline thread_local std::unordered_map< + executorch::backends::aoti::slim::c10::DeviceIndex, + cudaStream_t> + current_streams_; } /// Set the current CUDA stream for the specified device inline void setCurrentCUDAStream( cudaStream_t stream, - standalone::c10::DeviceIndex device_index = -1) { + executorch::backends::aoti::slim::c10::DeviceIndex device_index = -1) { if (device_index == -1) { // Get current device if not specified int current_device; @@ -39,7 +40,7 @@ inline void setCurrentCUDAStream( /// Get the current CUDA stream for the specified device inline cudaStream_t getCurrentCUDAStream( - standalone::c10::DeviceIndex device_index = -1) { + executorch::backends::aoti::slim::c10::DeviceIndex device_index = -1) { if (device_index == -1) { // Get current device if not specified int current_device; @@ -64,13 +65,14 @@ struct CUDAGuard { explicit CUDAGuard() = delete; /// Set the current CUDA device to the passed device index. - explicit CUDAGuard(standalone::c10::DeviceIndex device_index) { + explicit CUDAGuard( + executorch::backends::aoti::slim::c10::DeviceIndex device_index) { set_index(device_index); } /// Sets the current CUDA device to the passed device. Errors if the passed /// device is not a CUDA device. - explicit CUDAGuard(standalone::c10::Device device) { + explicit CUDAGuard(executorch::backends::aoti::slim::c10::Device device) { STANDALONE_CHECK( device.is_cuda(), "Expected a CUDA device for CUDAGuard, but got ", @@ -94,7 +96,8 @@ struct CUDAGuard { } /// Sets the CUDA device to the given device index. - void set_index(standalone::c10::DeviceIndex device_index) { + void set_index( + executorch::backends::aoti::slim::c10::DeviceIndex device_index) { int orig_index = -1; STANDALONE_CUDA_CHECK(cudaGetDevice(&orig_index)); @@ -107,8 +110,8 @@ struct CUDAGuard { private: /// The guard for the current device. - standalone::c10::DeviceIndex original_device_index_; - standalone::c10::DeviceIndex current_device_index_; + executorch::backends::aoti::slim::c10::DeviceIndex original_device_index_; + executorch::backends::aoti::slim::c10::DeviceIndex current_device_index_; }; struct CUDAStreamGuard { @@ -118,7 +121,7 @@ struct CUDAStreamGuard { /// Set the current CUDA stream to the passed stream on the specified device. explicit CUDAStreamGuard( cudaStream_t stream, - standalone::c10::DeviceIndex device_index) + executorch::backends::aoti::slim::c10::DeviceIndex device_index) : device_guard_(device_index) { set_stream(stream, device_index); } @@ -140,7 +143,7 @@ struct CUDAStreamGuard { /// Sets the CUDA stream to the given stream on the specified device. 
void set_stream( cudaStream_t stream, - standalone::c10::DeviceIndex device_index) { + executorch::backends::aoti::slim::c10::DeviceIndex device_index) { // Store the original stream for this device original_stream_ = getCurrentCUDAStream(device_index); current_stream_ = stream; @@ -156,7 +159,7 @@ struct CUDAStreamGuard { } /// Get the device index being guarded - standalone::c10::DeviceIndex device_index() const { + executorch::backends::aoti::slim::c10::DeviceIndex device_index() const { return device_index_; } @@ -168,7 +171,7 @@ struct CUDAStreamGuard { /// The current stream being guarded cudaStream_t current_stream_ = nullptr; /// The device index for this stream guard - standalone::c10::DeviceIndex device_index_; + executorch::backends::aoti::slim::c10::DeviceIndex device_index_; }; -} // namespace standalone::slim::cuda +} // namespace executorch::backends::aoti::slim::cuda diff --git a/backends/aoti/slim/factory/Empty.h b/backends/aoti/slim/factory/Empty.h index bbd4996b84c..20dd89fe1e6 100644 --- a/backends/aoti/slim/factory/Empty.h +++ b/backends/aoti/slim/factory/Empty.h @@ -7,23 +7,23 @@ #include #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { // The returned SlimTensor owns the underlying storage inline SlimTensor empty_strided( - standalone::c10::IntArrayRef sizes, - standalone::c10::IntArrayRef strides, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device = CPU_DEVICE) { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::IntArrayRef strides, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) { Storage storage = new_storage(sizes, strides, dtype, device); return SlimTensor(std::move(storage), sizes, strides, dtype, 0); } inline SlimTensor empty( - standalone::c10::IntArrayRef sizes, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device = CPU_DEVICE) { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) { std::vector contig_strides = - standalone::slim::compute_contiguous_strides(sizes); + executorch::backends::aoti::slim::compute_contiguous_strides(sizes); Storage storage = new_storage(sizes, contig_strides, dtype, device); return SlimTensor(std::move(storage), sizes, contig_strides, dtype, 0); } @@ -32,4 +32,4 @@ inline SlimTensor empty_like(const SlimTensor& other) { return empty_strided( other.sizes(), other.strides(), other.dtype(), other.device()); } -} // namespace standalone::slim +} // namespace executorch::backends::aoti::slim diff --git a/backends/aoti/slim/factory/Factory.h b/backends/aoti/slim/factory/Factory.h index 5e172bc9f6a..f0d26041ad3 100644 --- a/backends/aoti/slim/factory/Factory.h +++ b/backends/aoti/slim/factory/Factory.h @@ -2,13 +2,13 @@ #include -namespace standalone::slim { +namespace executorch::backends::aoti::slim { inline SlimTensor zeros( - standalone::c10::IntArrayRef sizes, - standalone::c10::ScalarType dtype, - const standalone::c10::Device& device = CPU_DEVICE) { + executorch::backends::aoti::slim::c10::IntArrayRef sizes, + executorch::backends::aoti::slim::c10::ScalarType dtype, + const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) { SlimTensor tensor = empty(sizes, dtype, device); - tensor.fill_(standalone::c10::Scalar(0)); + 
diff --git a/backends/aoti/slim/factory/Factory.h b/backends/aoti/slim/factory/Factory.h
index 5e172bc9f6a..f0d26041ad3 100644
--- a/backends/aoti/slim/factory/Factory.h
+++ b/backends/aoti/slim/factory/Factory.h
@@ -2,13 +2,13 @@

 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 inline SlimTensor zeros(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor tensor = empty(sizes, dtype, device);
-  tensor.fill_(standalone::c10::Scalar(0));
+  tensor.fill_(executorch::backends::aoti::slim::c10::Scalar(0));
   return tensor;
 }

@@ -17,11 +17,11 @@ inline SlimTensor zeros_like(const SlimTensor& other) {
 }

 inline SlimTensor ones(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor tensor = empty(sizes, dtype, device);
-  tensor.fill_(standalone::c10::Scalar(1));
+  tensor.fill_(executorch::backends::aoti::slim::c10::Scalar(1));
   return tensor;
 }

@@ -29,4 +29,4 @@ inline SlimTensor ones_like(const SlimTensor& other) {
   return ones(other.sizes(), other.dtype(), other.device());
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/FromBlob.h b/backends/aoti/slim/factory/FromBlob.h
index d1877f7f31d..c7a558f72ed 100644
--- a/backends/aoti/slim/factory/FromBlob.h
+++ b/backends/aoti/slim/factory/FromBlob.h
@@ -2,15 +2,15 @@

 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 // The returned SlimTensor does not own the underlying storage
 inline SlimTensor from_blob(
     void* data,
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::IntArrayRef strides,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef strides,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE,
     int64_t storage_offset = 0) {
   STANDALONE_CHECK(data != nullptr, "data pointer can not be nullptr");
@@ -24,13 +24,13 @@ inline SlimTensor from_blob(

 inline SlimTensor from_blob(
     void* data,
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::ScalarType dtype,
-    const standalone::c10::Device& device = CPU_DEVICE,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::ScalarType dtype,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE,
     int64_t storage_offset = 0) {
   std::vector<int64_t> contig_strides =
-      standalone::slim::compute_contiguous_strides(sizes);
+      executorch::backends::aoti::slim::compute_contiguous_strides(sizes);
   return from_blob(data, sizes, contig_strides, dtype, device, storage_offset);
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/factory/FromScalar.h b/backends/aoti/slim/factory/FromScalar.h
index 223f734d940..df01121a6f7 100644
--- a/backends/aoti/slim/factory/FromScalar.h
+++ b/backends/aoti/slim/factory/FromScalar.h
@@ -2,14 +2,14 @@

 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 inline SlimTensor scalar_to_tensor(
-    const standalone::c10::Scalar& s,
-    const standalone::c10::Device& device = CPU_DEVICE) {
+    const executorch::backends::aoti::slim::c10::Scalar& s,
+    const executorch::backends::aoti::slim::c10::Device& device = CPU_DEVICE) {
   SlimTensor result = empty_strided({}, {}, s.type(), device);
   result.fill_(s);
   return result;
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
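Likewise for the non-owning and scalar factories, a hedged sketch under the same assumed alias:

#include <vector>

namespace slim = executorch::backends::aoti::slim;

void factory_demo() {
  std::vector<float> buf = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};

  // Wraps caller-owned memory without copying; `buf` must outlive `t`.
  slim::SlimTensor t =
      slim::from_blob(buf.data(), {2, 3}, slim::c10::ScalarType::Float);

  // 0-dimensional tensor holding a single value; dtype follows the Scalar.
  slim::SlimTensor s = slim::scalar_to_tensor(slim::c10::Scalar(7.0));
}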
diff --git a/backends/aoti/slim/factory/Pad.h b/backends/aoti/slim/factory/Pad.h
index 4d7fef731bd..44a83696a14 100644
--- a/backends/aoti/slim/factory/Pad.h
+++ b/backends/aoti/slim/factory/Pad.h
@@ -2,15 +2,15 @@

 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 inline SlimTensor constant_pad_nd(
     const SlimTensor& self,
-    standalone::c10::IntArrayRef pad,
-    const standalone::c10::Scalar& value) {
+    executorch::backends::aoti::slim::c10::IntArrayRef pad,
+    const executorch::backends::aoti::slim::c10::Scalar& value) {
   STANDALONE_CHECK(pad.size() % 2 == 0, "Length of pad must be even");

-  standalone::c10::IntArrayRef input_sizes = self.sizes();
+  executorch::backends::aoti::slim::c10::IntArrayRef input_sizes = self.sizes();
   int64_t l_inp = self.dim();
   int64_t l_pad = static_cast<int64_t>(pad.size()) / 2;
   int64_t l_diff = l_inp - l_pad;
@@ -50,7 +50,8 @@ inline SlimTensor constant_pad_nd(
     new_shape.emplace_back(input_sizes[i]);
   }

-  for (const auto i : standalone::c10::irange((size_t)l_pad)) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange((size_t)l_pad)) {
     auto pad_idx = pad.size() - ((i + 1) * 2);
     auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
     STANDALONE_CHECK(
@@ -73,7 +74,8 @@ inline SlimTensor constant_pad_nd(

   // create a view into the center of the output tensor
   SlimTensor c_output = output;
-  for (const auto i : standalone::c10::irange(l_diff, l_inp)) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(l_diff, l_inp)) {
     auto pad_idx = 2 * (l_inp - i - 1);
     if (pad[pad_idx] > 0) {
       c_output =
@@ -90,7 +92,7 @@ inline SlimTensor constant_pad_nd(

 inline SlimTensor pad(
     const SlimTensor& self,
-    standalone::c10::IntArrayRef pad,
+    executorch::backends::aoti::slim::c10::IntArrayRef pad,
     std::string_view mode,
     std::optional<double> value) {
   if (mode == "constant") {
@@ -103,4 +105,4 @@ inline SlimTensor pad(
       ". Only constant mode is available.");
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
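The padding semantics mirror ATen's constant_pad_nd: `pad` holds before/after pairs for the trailing dimensions, last dimension first. A worked sketch under the same assumed alias:

namespace slim = executorch::backends::aoti::slim;

void pad_demo() {
  slim::SlimTensor x = slim::ones({2, 3}, slim::c10::ScalarType::Float);

  // Pairs read {last-dim before, last-dim after, first-dim before,
  // first-dim after}, so the result shape is
  // {2 + 0 + 1, 3 + 1 + 2} = {3, 6}, with 0 filled at the border.
  slim::SlimTensor y =
      slim::constant_pad_nd(x, {1, 2, 0, 1}, slim::c10::Scalar(0));

  // The string-mode wrapper only accepts "constant" in this slim build.
  slim::SlimTensor z = slim::pad(x, {1, 1}, "constant", 0.0);
}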
diff --git a/backends/aoti/slim/tests/test_slim_tensor_basic.cpp b/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
index 37b6ccb240d..e37f252c740 100644
--- a/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
+++ b/backends/aoti/slim/tests/test_slim_tensor_basic.cpp
@@ -14,31 +14,39 @@
 #include
 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 namespace {

 TEST(SlimTensorBasicTest, EmptyTensorCreation) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.dim(), 3);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
   EXPECT_EQ(tensor.size(2), 4);
   EXPECT_EQ(tensor.numel(), 24);
-  EXPECT_EQ(tensor.dtype(), standalone::c10::ScalarType::Float);
+  EXPECT_EQ(
+      tensor.dtype(), executorch::backends::aoti::slim::c10::ScalarType::Float);
   EXPECT_TRUE(tensor.is_contiguous());
 }

 TEST(SlimTensorBasicTest, EmptyTensorContiguousStrides) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.stride(0), 12);
   EXPECT_EQ(tensor.stride(1), 4);
   EXPECT_EQ(tensor.stride(2), 1);
 }

 TEST(SlimTensorBasicTest, ZerosTensorCreation) {
-  auto tensor = zeros({3, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = zeros(
+      {3, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.numel(), 9);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 9; ++i) {
@@ -47,7 +55,10 @@ TEST(SlimTensorBasicTest, ZerosTensorCreation) {
 }

 TEST(SlimTensorBasicTest, OnesTensorCreation) {
-  auto tensor = ones({2, 2}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = ones(
+      {2, 2},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.numel(), 4);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 4; ++i) {
@@ -56,7 +67,10 @@ TEST(SlimTensorBasicTest, OnesTensorCreation) {
 }

 TEST(SlimTensorBasicTest, FillTensor) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(5.0f);
   float* data = static_cast<float*>(tensor.data_ptr());
   for (int i = 0; i < 6; ++i) {
@@ -67,7 +81,10 @@ TEST(SlimTensorBasicTest, FillTensor) {
 TEST(SlimTensorBasicTest, FromBlobNonOwning) {
   std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   auto tensor = from_blob(
-      data.data(), {2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+      data.data(),
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   EXPECT_EQ(tensor.dim(), 2);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
@@ -76,7 +93,10 @@ TEST(SlimTensorBasicTest, FromBlobNonOwning) {
 }

 TEST(SlimTensorBasicTest, Clone) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(3.14f);

   auto cloned = tensor.clone();
@@ -91,10 +111,16 @@ TEST(SlimTensorBasicTest, Clone) {
 }

 TEST(SlimTensorBasicTest, CopyFrom) {
-  auto src = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto src = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   src.fill_(2.5f);

-  auto dst = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto dst = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   dst.copy_(src);

   float* dst_data = static_cast<float*>(dst.data_ptr());
@@ -104,7 +130,10 @@ TEST(SlimTensorBasicTest, CopyFrom) {
 }

 TEST(SlimTensorBasicTest, Reshape) {
-  auto tensor = empty({2, 6}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 6},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   tensor.fill_(1.0f);

   auto reshaped = tensor.reshape({3, 4});
@@ -115,15 +144,20 @@ TEST(SlimTensorBasicTest, Reshape) {
 }

 TEST(SlimTensorBasicTest, Transpose) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto transposed = tensor.transpose(0, 1);
   EXPECT_EQ(transposed.size(0), 3);
   EXPECT_EQ(transposed.size(1), 2);
 }

 TEST(SlimTensorBasicTest, Permute) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto permuted = tensor.permute({2, 0, 1});
   EXPECT_EQ(permuted.size(0), 4);
   EXPECT_EQ(permuted.size(1), 2);
@@ -131,7 +165,10 @@ TEST(SlimTensorBasicTest, Permute) {
 }

 TEST(SlimTensorBasicTest, Narrow) {
-  auto tensor = empty({10}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {10},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   for (int i = 0; i < 10; ++i) {
     static_cast<float*>(tensor.data_ptr())[i] = static_cast<float>(i);
   }
@@ -147,8 +184,10 @@ TEST(SlimTensorBasicTest, Narrow) {
 }

 TEST(SlimTensorBasicTest, EmptyLike) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto empty_like_tensor = empty_like(tensor);
   EXPECT_EQ(empty_like_tensor.sizes(), tensor.sizes());
   EXPECT_EQ(empty_like_tensor.dtype(), tensor.dtype());
@@ -156,7 +195,10 @@ TEST(SlimTensorBasicTest, EmptyLike) {
 }

 TEST(SlimTensorBasicTest, ZerosLike) {
-  auto tensor = empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   auto zeros_tensor = zeros_like(tensor);

   EXPECT_EQ(zeros_tensor.sizes(), tensor.sizes());
@@ -167,4 +209,4 @@ TEST(SlimTensorBasicTest, ZerosLike) {
 }

 } // namespace
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
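Downstream code that still spells `standalone::slim` or `standalone::c10` stops compiling after this rename. One possible transition aid (an assumption about how callers might stage the migration, not part of this diff) is a temporary alias shim:

// Compatibility shim; delete once all call sites use the new namespace.
namespace standalone {
namespace slim = ::executorch::backends::aoti::slim;
namespace c10 = ::executorch::backends::aoti::slim::c10;
} // namespace standalone

// Old spellings such as standalone::slim::zeros(...) and
// standalone::c10::ScalarType::Float then resolve through the aliases.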
diff --git a/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp b/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
index 571d4f99893..2bca695fa15 100644
--- a/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
+++ b/backends/aoti/slim/tests/test_slim_tensor_cuda.cpp
@@ -15,7 +15,7 @@
 #include
 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 namespace {

 class SlimTensorCUDATest : public ::testing::Test {
@@ -30,22 +30,30 @@ class SlimTensorCUDATest : public ::testing::Test {
 };

 TEST_F(SlimTensorCUDATest, EmptyCUDATensorCreation) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.dim(), 3);
   EXPECT_EQ(tensor.size(0), 2);
   EXPECT_EQ(tensor.size(1), 3);
   EXPECT_EQ(tensor.size(2), 4);
   EXPECT_EQ(tensor.numel(), 24);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);
   EXPECT_TRUE(tensor.is_contiguous());
 }

 TEST_F(SlimTensorCUDATest, ZerosCUDATensor) {
-  auto tensor =
-      zeros({3, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = zeros(
+      {3, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.numel(), 9);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);

   std::vector<float> host_data(9);
   cudaMemcpy(
@@ -60,8 +68,10 @@ TEST_F(SlimTensorCUDATest, ZerosCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, OnesCUDATensor) {
-  auto tensor =
-      ones({2, 2}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = ones(
+      {2, 2},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   EXPECT_EQ(tensor.numel(), 4);

   std::vector<float> host_data(4);
@@ -77,8 +87,10 @@ TEST_F(SlimTensorCUDATest, OnesCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, FillCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   tensor.fill_(5.0f);

   std::vector<float> host_data(6);
@@ -94,8 +106,10 @@ TEST_F(SlimTensorCUDATest, FillCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, CloneCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   tensor.fill_(3.14f);

   auto cloned = tensor.clone();
@@ -116,12 +130,16 @@ TEST_F(SlimTensorCUDATest, CloneCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, CopyCUDAToCUDA) {
-  auto src =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto src = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   src.fill_(2.5f);

-  auto dst =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto dst = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   dst.copy_(src);

   std::vector<float> host_data(6);
@@ -137,12 +155,16 @@ TEST_F(SlimTensorCUDATest, CopyCUDAToCUDA) {
 }

 TEST_F(SlimTensorCUDATest, CopyCPUToCUDA) {
-  auto cpu_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto cpu_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   cpu_tensor.fill_(1.5f);

-  auto cuda_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto cuda_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   cuda_tensor.copy_(cpu_tensor);

   std::vector<float> host_data(6);
@@ -158,12 +180,16 @@ TEST_F(SlimTensorCUDATest, CopyCPUToCUDA) {
 }

 TEST_F(SlimTensorCUDATest, CopyCUDAToCPU) {
-  auto cuda_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto cuda_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   cuda_tensor.fill_(4.5f);

-  auto cpu_tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, CPU_DEVICE);
+  auto cpu_tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      CPU_DEVICE);
   cpu_tensor.copy_(cuda_tensor);

   float* data = static_cast<float*>(cpu_tensor.data_ptr());
@@ -174,14 +200,20 @@ TEST_F(SlimTensorCUDATest, CopyCUDAToCPU) {

 TEST_F(SlimTensorCUDATest, CUDAGuard) {
   cuda::CUDAGuard guard(0);
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
-  EXPECT_EQ(tensor.device().type(), standalone::c10::DeviceType::CUDA);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
+  EXPECT_EQ(
+      tensor.device().type(),
+      executorch::backends::aoti::slim::c10::DeviceType::CUDA);
 }

 TEST_F(SlimTensorCUDATest, ReshapeCUDATensor) {
-  auto tensor =
-      empty({2, 6}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 6},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto reshaped = tensor.reshape({3, 4});
   EXPECT_EQ(reshaped.dim(), 2);
   EXPECT_EQ(reshaped.size(0), 3);
@@ -190,8 +222,10 @@ TEST_F(SlimTensorCUDATest, ReshapeCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, TransposeCUDATensor) {
-  auto tensor =
-      empty({2, 3}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto transposed = tensor.transpose(0, 1);
   EXPECT_EQ(transposed.size(0), 3);
   EXPECT_EQ(transposed.size(1), 2);
@@ -199,8 +233,10 @@ TEST_F(SlimTensorCUDATest, TransposeCUDATensor) {
 }

 TEST_F(SlimTensorCUDATest, PermuteCUDATensor) {
-  auto tensor =
-      empty({2, 3, 4}, standalone::c10::ScalarType::Float, DEFAULT_CUDA_DEVICE);
+  auto tensor = empty(
+      {2, 3, 4},
+      executorch::backends::aoti::slim::c10::ScalarType::Float,
+      DEFAULT_CUDA_DEVICE);
   auto permuted = tensor.permute({2, 0, 1});
   EXPECT_EQ(permuted.size(0), 4);
   EXPECT_EQ(permuted.size(1), 2);
@@ -209,4 +245,4 @@ TEST_F(SlimTensorCUDATest, PermuteCUDATensor) {
 }

 } // namespace
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
diff --git a/backends/aoti/slim/util/SharedPtr.h b/backends/aoti/slim/util/SharedPtr.h
index 9ad565d9ab9..33a4def5845 100644
--- a/backends/aoti/slim/util/SharedPtr.h
+++ b/backends/aoti/slim/util/SharedPtr.h
@@ -7,7 +7,7 @@

 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {

 /**
  * NonAtomicSharedPtr - A lightweight, non-thread-safe shared pointer
@@ -210,7 +210,7 @@ std::shared_ptr<T> make_shared(Args&&... args) {
 #else

 template <typename T>
-using SharedPtr = ::standalone::slim::NonAtomicSharedPtr<T>;
+using SharedPtr = ::executorch::backends::aoti::slim::NonAtomicSharedPtr<T>;

 // make_shared for NonAtomicSharedPtr
 template <typename T, typename... Args>
@@ -219,4 +219,4 @@ NonAtomicSharedPtr<T> make_shared(Args&&... args) {
 }

 #endif // USE_MULTI_THREAD

-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
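A small sketch of the alias above: when USE_MULTI_THREAD is not defined, SharedPtr<T> is the non-atomic implementation, trading thread safety for cheaper reference counting (illustrative; `Block` is a made-up payload type):

namespace slim = executorch::backends::aoti::slim;

struct Block {
  int id;
  explicit Block(int i) : id(i) {}
};

void refcount_demo() {
  // The control block uses a plain integer count, so copies avoid
  // atomic operations but must stay on one thread.
  slim::SharedPtr<Block> p = slim::make_shared<Block>(42);
  slim::SharedPtr<Block> q = p; // count becomes 2
} // count drops back to 0 here and Block is destroyed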
diff --git a/backends/aoti/slim/util/SizeUtil.h b/backends/aoti/slim/util/SizeUtil.h
index d22416cd176..4eab9fc2329 100644
--- a/backends/aoti/slim/util/SizeUtil.h
+++ b/backends/aoti/slim/util/SizeUtil.h
@@ -11,7 +11,7 @@
 #include
 #include

-namespace standalone::slim {
+namespace executorch::backends::aoti::slim {
 #ifndef STANDALONE_MOBILE
 inline constexpr uint64_t storage_max() {
   // int64_t and size_t are used somewhat inconsistently throughout ATen.
@@ -28,9 +28,11 @@
 * tensor. Catches integer overflow that may occur when a tensor
 * using a sparse layout has multiple dimensions with large sizes.
 */
-inline int64_t safe_compute_numel(standalone::c10::IntArrayRef sizes) {
+inline int64_t safe_compute_numel(
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes) {
   uint64_t n = 1;
-  bool overflowed = standalone::c10::safe_multiplies_u64(sizes, &n);
+  bool overflowed =
+      executorch::backends::aoti::slim::c10::safe_multiplies_u64(sizes, &n);
   overflowed |= (n > storage_max());
   STANDALONE_CHECK(!overflowed, "numel: integer multiplication overflow");
   return static_cast<int64_t>(n);
@@ -59,26 +61,30 @@ inline std::vector<int64_t> safe_compute_contiguous_strides(
 }
 #endif // STANDALONE_MOBILE

-inline int64_t compute_numel(standalone::c10::IntArrayRef sizes) {
+inline int64_t compute_numel(
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes) {
 #ifndef STANDALONE_MOBILE
   // Use overflow checks if supported by the compiler
   return safe_compute_numel(sizes);
 #else
-  return standalone::c10::multiply_integers(sizes);
+  return executorch::backends::aoti::slim::c10::multiply_integers(sizes);
 #endif
 }

 // named computeStorageNbytesContiguous in c10
 inline size_t compute_storage_nbytes_contiguous(
-    standalone::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
     size_t itemsize_bytes,
     size_t storage_offset) {
   // Ignore overflow checks on mobile
 #ifndef STANDALONE_MOBILE
   uint64_t size = 1;
-  bool overflowed = standalone::c10::safe_multiplies_u64(sizes, &size);
-  overflowed |= standalone::c10::add_overflows(size, storage_offset, &size);
-  overflowed |= standalone::c10::mul_overflows(size, itemsize_bytes, &size);
+  bool overflowed =
+      executorch::backends::aoti::slim::c10::safe_multiplies_u64(sizes, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::add_overflows(
+      size, storage_offset, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+      size, itemsize_bytes, &size);
   overflowed |= size > storage_max();
   STANDALONE_CHECK(
       !overflowed, "Storage size calculation overflowed with sizes=", sizes);
@@ -91,8 +97,8 @@ inline size_t compute_storage_nbytes_contiguous(

 // named computeStorageNbytes in c10
 inline size_t compute_storage_nbytes(
-    standalone::c10::IntArrayRef sizes,
-    standalone::c10::IntArrayRef strides,
+    executorch::backends::aoti::slim::c10::IntArrayRef sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef strides,
     size_t itemsize_bytes,
     size_t storage_offset) {
   STANDALONE_CHECK(
@@ -109,17 +115,20 @@ inline size_t compute_storage_nbytes(
   // of the last element according to stride
   uint64_t size = storage_offset + 1;
   bool overflowed = false;
-  for (const auto i : standalone::c10::irange(sizes.size())) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(sizes.size())) {
     if (sizes[i] == 0) {
       return 0;
     }

     uint64_t strided_size = 0;
-    overflowed |=
-        standalone::c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size);
-    overflowed |= standalone::c10::add_overflows(size, strided_size, &size);
+    overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+        strides[i], sizes[i] - 1, &strided_size);
+    overflowed |= executorch::backends::aoti::slim::c10::add_overflows(
+        size, strided_size, &size);
   }
-  overflowed |= standalone::c10::mul_overflows(size, itemsize_bytes, &size);
+  overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+      size, itemsize_bytes, &size);
   overflowed |= size > storage_max();
   STANDALONE_CHECK(
       !overflowed,
@@ -132,7 +141,8 @@ inline size_t compute_storage_nbytes(
   // size of the underlying storage is 1 bigger than the offset
   // of the last element according to stride
   uint64_t size = 1;
-  for (const auto i : standalone::c10::irange(sizes.size())) {
+  for (const auto i :
+       executorch::backends::aoti::slim::c10::irange(sizes.size())) {
     if (sizes[i] == 0) {
       return 0;
     }
@@ -165,7 +175,7 @@ inline std::vector<int64_t> compute_contiguous_strides(c10::IntArrayRef sizes) {
 // calculates the final concrete shape by also filling in at most one '-1'
 // dimension.
 inline std::vector<int64_t> infer_size(
-    standalone::c10::IntArrayRef shape,
+    executorch::backends::aoti::slim::c10::IntArrayRef shape,
     int64_t numel) {
   int64_t new_size = 1;
   std::optional<int64_t> infer_dim;
@@ -182,8 +192,8 @@ inline std::vector<int64_t> infer_size(
       result_shape.push_back(-1); // placeholder
     } else {
       STANDALONE_CHECK(shape[dim] >= 0, "invalid shape dimension ", shape[dim]);
-      overflowed |=
-          standalone::c10::mul_overflows(new_size, shape[dim], &new_size);
+      overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
+          new_size, shape[dim], &new_size);
       result_shape.push_back(shape[dim]);
     }
   }
@@ -207,9 +217,9 @@ inline std::vector<int64_t> infer_size(
 // If so, it returns the new strides
 // If not, it returns an empty optional
 inline std::optional<std::vector<int64_t>> compute_stride(
-    standalone::c10::IntArrayRef old_sizes,
-    standalone::c10::IntArrayRef old_strides,
-    standalone::c10::IntArrayRef new_sizes) {
+    executorch::backends::aoti::slim::c10::IntArrayRef old_sizes,
+    executorch::backends::aoti::slim::c10::IntArrayRef old_strides,
+    executorch::backends::aoti::slim::c10::IntArrayRef new_sizes) {
   if (old_sizes.empty()) {
     return std::vector<int64_t>(new_sizes.size(), 1);
   }
@@ -248,7 +258,7 @@ inline std::optional<std::vector<int64_t>> compute_stride(
       tensor_d--) {
     // TODO: ask if this could lead to overflow by any chance?
     // even if so, overflow is not handled in the aten implementation
-    overflowed |= standalone::c10::mul_overflows(
+    overflowed |= executorch::backends::aoti::slim::c10::mul_overflows(
         tensor_numel, old_sizes[tensor_d], &tensor_numel);
     bool is_chunk_end = (tensor_d == 0) ||
@@ -280,4 +290,4 @@ inline std::optional<std::vector<int64_t>> compute_stride(
   return new_strides;
 }
-} // namespace standalone::slim
+} // namespace executorch::backends::aoti::slim
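To make the view machinery above concrete: `infer_size` resolves at most one -1 against a known element count, and `compute_stride` reports whether a reshape can remain a zero-copy view. A sketch under the same assumed alias, with values checked against the chunk logic above:

#include <cassert>
#include <cstdint>
#include <vector>

namespace slim = executorch::backends::aoti::slim;

void view_demo() {
  // {-1, 4} with 24 elements resolves to {6, 4}.
  std::vector<int64_t> shape = slim::infer_size({-1, 4}, /*numel=*/24);
  assert(shape[0] == 6 && shape[1] == 4);

  // A contiguous {2, 12} layout can be viewed as {6, 4}; new strides {4, 1}.
  auto ok = slim::compute_stride({2, 12}, /*old_strides=*/{12, 1}, {6, 4});
  assert(ok.has_value());

  // A transposed {2, 12} layout (strides {1, 2}) cannot, so the optional
  // comes back empty and the caller must fall back to a copy.
  auto none = slim::compute_stride({2, 12}, {1, 2}, {6, 4});
  assert(!none.has_value());
}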