diff --git a/backends/aoti/slim/c10/macros/Macros.h b/backends/aoti/slim/c10/macros/Macros.h index aa8329263fe..d05f177ccd8 100644 --- a/backends/aoti/slim/c10/macros/Macros.h +++ b/backends/aoti/slim/c10/macros/Macros.h @@ -1,219 +1,61 @@ #pragma once -#include - -// UBSan (Undefined Behavior Sanitizer) macros -#if defined(__clang__) -#define __ubsan_ignore_float_divide_by_zero__ \ - __attribute__((no_sanitize("float-divide-by-zero"))) -#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) -#define __ubsan_ignore_signed_int_overflow__ \ - __attribute__((no_sanitize("signed-integer-overflow"))) -#define __ubsan_ignore_pointer_overflow__ \ - __attribute__((no_sanitize("pointer-overflow"))) -#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) -#define __ubsan_ignore_float_cast_overflow__ \ - __attribute__((no_sanitize("float-cast-overflow"))) -#else -#define __ubsan_ignore_float_divide_by_zero__ -#define __ubsan_ignore_undefined__ -#define __ubsan_ignore_signed_int_overflow__ -#define __ubsan_ignore_pointer_overflow__ -#define __ubsan_ignore_function__ -#define __ubsan_ignore_float_cast_overflow__ -#endif - -// STANDALONE_LIKELY/STANDALONE_UNLIKELY -// -// These macros provide parentheses, so you can use these macros as: -// -// if STANDALONE_LIKELY(some_expr) { -// ... -// } -// -// NB: static_cast to boolean is mandatory in C++, because __builtin_expect -// takes a long argument, which means you may trigger the wrong conversion -// without it. -// -#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) -#define STANDALONE_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) -#define STANDALONE_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) -#else -#define STANDALONE_LIKELY(expr) (expr) -#define STANDALONE_UNLIKELY(expr) (expr) -#endif - -// On nvcc, STANDALONE_UNLIKELY thwarts missing return statement analysis. In -// cases where the unlikely expression may be a constant, use this macro to -// ensure return statement analysis keeps working (at the cost of not getting -// the likely/unlikely annotation on nvcc). -// https://github.com/pytorch/pytorch/issues/21418 +// SlimTensor Macros Header // -// Currently, this is only used in the error reporting macros below. If you -// want to use it more generally, move me to Macros.h -// -// TODO: Brian Vaughan observed that we might be able to get this to work on -// nvcc by writing some sort of C++ overload that distinguishes constexpr inputs -// from non-constexpr. Since there isn't any evidence that losing -// STANDALONE_UNLIKELY in nvcc is causing us perf problems, this is not yet -// implemented, but this might be an interesting piece of C++ code for an -// intrepid bootcamper to write. -#if defined(__CUDACC__) -#define STANDALONE_UNLIKELY_OR_CONST(e) e -#else -#define STANDALONE_UNLIKELY_OR_CONST(e) STANDALONE_UNLIKELY(e) -#endif - -#define STANDALONE_STRINGIZE_IMPL(x) #x -#define STANDALONE_STRINGIZE(x) STANDALONE_STRINGIZE_IMPL(x) - -#define STANDALONE_CONCATENATE_IMPL(s1, s2) s1##s2 -#define STANDALONE_CONCATENATE(s1, s2) STANDALONE_CONCATENATE_IMPL(s1, s2) - -/** - * STANDALONE_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts - * with str and ends with a unique number. 
- */ -#ifdef __COUNTER__ -#define STANDALONE_UID __COUNTER__ -#define STANDALONE_ANONYMOUS_VARIABLE(str) \ - STANDALONE_CONCATENATE(str, __COUNTER__) -#else -#define STANDALONE_UID __LINE__ -#define STANDALONE_ANONYMOUS_VARIABLE(str) STANDALONE_CONCATENATE(str, __LINE__) -#endif - -// Private helper macro for workaround MSVC misexpansion of nested macro -// invocations involving __VA_ARGS__. See -// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly -#define STANDALONE_EXPAND_MSVC_WORKAROUND(x) x - -/// STANDALONE_NOINLINE - Functions whose declaration is annotated with this -/// will not be inlined. -#ifdef __GNUC__ -#define STANDALONE_NOINLINE __attribute__((noinline)) -#elif _MSC_VER -#define STANDALONE_NOINLINE __declspec(noinline) -#else -#define STANDALONE_NOINLINE -#endif - -#if defined(_MSC_VER) -#define STANDALONE_ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define STANDALONE_ALWAYS_INLINE __attribute__((__always_inline__)) inline -#else -#define STANDALONE_ALWAYS_INLINE inline -#endif - -// Unlike STANDALONE_ALWAYS_INLINE, STANDALONE_ALWAYS_INLINE_ATTRIBUTE can be -// used on a lambda. -#if defined(_MSC_VER) -// MSVC 14.39 is reasonably recent and doesn't like -// [[msvc::forceinline]] on a lambda, so don't try to use it. -#define STANDALONE_ALWAYS_INLINE_ATTRIBUTE -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define STANDALONE_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) -#else -#define STANDALONE_ALWAYS_INLINE_ATTRIBUTE -#endif - -#if defined(_MSC_VER) -#define STANDALONE_ATTR_VISIBILITY_HIDDEN -#elif defined(__GNUC__) -#define STANDALONE_ATTR_VISIBILITY_HIDDEN \ - __attribute__((__visibility__("hidden"))) -#else -#define STANDALONE_ATTR_VISIBILITY_HIDDEN -#endif - -#define STANDALONE_ERASE \ - STANDALONE_ALWAYS_INLINE STANDALONE_ATTR_VISIBILITY_HIDDEN - -#include - -#ifdef __HIPCC__ -// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. -// We do this #include here so that STANDALONE_HOST_DEVICE and friends will Just -// Work. See https://github.com/ROCm/hip/issues/441 -#include -#endif - -#if defined(__CUDACC__) || defined(__HIPCC__) -// Designates functions callable from the host (CPU) and the device (GPU) -#define STANDALONE_HOST_DEVICE __host__ __device__ -#define STANDALONE_DEVICE __device__ -#define STANDALONE_HOST __host__ -// constants from -// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing -// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and -// 2048 for all other architectures. You'll get warnings if you exceed these -// constants. Hence, the following macros adjust the input values from the user -// to resolve potential warnings. -#if __CUDA_ARCH__ == 750 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; -#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; -#else -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; -#endif -// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently -constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; -// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block -// size. 256 is a good number for this fallback and should give good occupancy -// and versatility across all architectures. 
-constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; -// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it -// turns out that although __launch_bounds__ can take constexpr, it -// can't take a constexpr that has anything to do with templates. -// Currently we use launch_bounds that depend on template arguments in -// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, -// STANDALONE_MAX_THREADS_PER_BLOCK and STANDALONE_MIN_BLOCKS_PER_SM are -// kept as macros. -// Suppose you were planning to write __launch_bounds__(a, b), based on your -// performance tuning on a modern GPU. Instead, you should write -// __launch_bounds__(STANDALONE_MAX_THREADS_PER_BLOCK(a), -// STANDALONE_MIN_BLOCKS_PER_SM(a, b)), which will also properly respect limits -// on old architectures. -#define STANDALONE_MAX_THREADS_PER_BLOCK(val) \ - (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ - : CUDA_THREADS_PER_BLOCK_FALLBACK) -#define STANDALONE_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ - ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ - ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ - (threads_per_block)))) -// STANDALONE_LAUNCH_BOUNDS is analogous to __launch_bounds__ -#define STANDALONE_LAUNCH_BOUNDS_0 \ - __launch_bounds__( \ - 256, 4) // default launch bounds that should give good occupancy - // and versatility across all architectures. -#define STANDALONE_LAUNCH_BOUNDS_1(max_threads_per_block) \ - __launch_bounds__((STANDALONE_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) -#define STANDALONE_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ - __launch_bounds__( \ - (STANDALONE_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ - (STANDALONE_MIN_BLOCKS_PER_SM( \ - (max_threads_per_block), (min_blocks_per_sm)))) -#else -#define STANDALONE_HOST_DEVICE -#define STANDALONE_HOST -#define STANDALONE_DEVICE -#endif - -#define _STANDALONE_PRAGMA__(string) _Pragma(#string) -#define _STANDALONE_PRAGMA_(string) _STANDALONE_PRAGMA__(string) - -#ifdef __clang__ -#define STANDALONE_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") -#define STANDALONE_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") -#define STANDALONE_CLANG_DIAGNOSTIC_IGNORE(flag) \ - _STANDALONE_PRAGMA_(clang diagnostic ignored flag) -#define STANDALONE_CLANG_HAS_WARNING(flag) __has_warning(flag) -#else -#define STANDALONE_CLANG_DIAGNOSTIC_PUSH() -#define STANDALONE_CLANG_DIAGNOSTIC_POP() -#define STANDALONE_CLANG_DIAGNOSTIC_IGNORE(flag) -#define STANDALONE_CLANG_HAS_WARNING(flag) 0 +// This header bridges between SlimTensor's STANDALONE_* macro conventions and +// ExecuTorch's C10_* macros from portable_type. It includes the base macros +// from portable_type and provides STANDALONE_* aliases for backward +// compatibility. + +#include + +// ============================================================================= +// STANDALONE_* to C10_* macro mappings +// ============================================================================= +// These mappings allow SlimTensor code to use STANDALONE_* macros while +// actually using the underlying C10_* implementations from portable_type. 
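For context, a minimal usage sketch (not part of the diff): code written against the STANDALONE_* spellings keeps compiling unchanged, while the expansion now comes from portable_type's C10_* definitions via the aliases listed below. The include path and the `lerp` helper are illustrative assumptions.

```cpp
// Hypothetical SlimTensor-style caller; the executorch/ include prefix and
// the lerp() helper are assumptions, the macro names come from this header.
#include <executorch/backends/aoti/slim/c10/macros/Macros.h>

// Expands to __host__ __device__ under nvcc/hipcc (via C10_HOST_DEVICE),
// and to nothing in a CPU-only build.
STANDALONE_HOST_DEVICE inline float lerp(float a, float b, float t) {
  return a + t * (b - a);
}

int main() {
  const float y = lerp(0.f, 10.f, 0.25f);
  // Branch hint backed by C10_UNLIKELY through the STANDALONE_UNLIKELY alias.
  if (STANDALONE_UNLIKELY(y < 0.f)) {
    return 1;
  }
  return 0;
}
```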
+
+// Host/Device macros
+#define STANDALONE_HOST_DEVICE C10_HOST_DEVICE
+#define STANDALONE_DEVICE C10_DEVICE
+#define STANDALONE_HOST C10_HOST
+
+// Compiler hint macros
+#define STANDALONE_LIKELY C10_LIKELY
+#define STANDALONE_UNLIKELY C10_UNLIKELY
+#define STANDALONE_UNLIKELY_OR_CONST C10_UNLIKELY
+
+// String/concatenation macros
+#define STANDALONE_STRINGIZE_IMPL C10_STRINGIZE_IMPL
+#define STANDALONE_STRINGIZE C10_STRINGIZE
+#define STANDALONE_CONCATENATE_IMPL C10_CONCATENATE_IMPL
+#define STANDALONE_CONCATENATE C10_CONCATENATE
+
+// Anonymous variable macros
+#define STANDALONE_UID C10_UID
+#define STANDALONE_ANONYMOUS_VARIABLE C10_ANONYMOUS_VARIABLE
+
+// MSVC workaround
+#define STANDALONE_EXPAND_MSVC_WORKAROUND C10_MACRO_EXPAND
+
+// Inline/visibility macros
+#define STANDALONE_NOINLINE C10_NOINLINE
+#define STANDALONE_ALWAYS_INLINE C10_ALWAYS_INLINE
+#define STANDALONE_ALWAYS_INLINE_ATTRIBUTE C10_ALWAYS_INLINE_ATTRIBUTE
+#define STANDALONE_ATTR_VISIBILITY_HIDDEN C10_ATTR_VISIBILITY_HIDDEN
+#define STANDALONE_ERASE C10_ERASE
+
+// Clang diagnostic macros
+#define STANDALONE_CLANG_DIAGNOSTIC_PUSH C10_CLANG_DIAGNOSTIC_PUSH
+#define STANDALONE_CLANG_DIAGNOSTIC_POP C10_CLANG_DIAGNOSTIC_POP
+#define STANDALONE_CLANG_DIAGNOSTIC_IGNORE C10_CLANG_DIAGNOSTIC_IGNORE
+#define STANDALONE_CLANG_HAS_WARNING C10_CLANG_HAS_WARNING
+
+// CUDA launch bounds (these are identical between STANDALONE and C10)
+#ifdef __CUDACC__
+#define STANDALONE_MAX_THREADS_PER_BLOCK C10_MAX_THREADS_PER_BLOCK
+#define STANDALONE_MIN_BLOCKS_PER_SM C10_MIN_BLOCKS_PER_SM
+#define STANDALONE_LAUNCH_BOUNDS_0 C10_LAUNCH_BOUNDS_0
+#define STANDALONE_LAUNCH_BOUNDS_1 C10_LAUNCH_BOUNDS_1
+#define STANDALONE_LAUNCH_BOUNDS_2 C10_LAUNCH_BOUNDS_2
 #endif
diff --git a/backends/aoti/slim/c10/targets.bzl b/backends/aoti/slim/c10/targets.bzl
index c2f28bdfb45..47c157d82ab 100644
--- a/backends/aoti/slim/c10/targets.bzl
+++ b/backends/aoti/slim/c10/targets.bzl
@@ -4,6 +4,26 @@ def define_common_targets():
     """Define c10 library targets for SlimTensor.

     These are portable c10 utilities adapted from torchnative/standalone.
+    Many utility headers are now thin wrappers that reuse implementations
+    from executorch/runtime/core/portable_type/c10/c10.
+
+    Headers reused from portable_type/c10:
+    - Macros.h (with STANDALONE_* -> C10_* mappings)
+    - bit_cast.h
+    - irange.h
+    - floating_point_utils.h
+    - TypeSafeSignMath.h
+    - llvmMathExtras.h
+    - safe_numerics.h
+
+    SlimTensor-specific headers (kept due to additional features):
+    - Half.h, Half-inl.h (SlimTensor has its own inline implementation)
+    - BFloat16.h, BFloat16-inl.h, BFloat16-math.h
+    - complex.h (has complex specialization)
+    - Float8_* types (not in portable_type)
+    - Quantized types (qint8, quint8, etc.)
+    - Array.h, accumulate.h (SlimTensor-specific utilities)
+    - core/* headers (Device, Scalar, ScalarType, etc.)
     """

     # c10 utility headers (ArrayRef, Half, BFloat16, complex, etc.)
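To make the docstring concrete, this is the general shape of the thin-wrapper pattern it describes (an illustrative sketch, not one of the real files in this diff; the include spelling for the portable_type copy is assumed):

```cpp
// Shape of a thin wrapper header: include the portable_type implementation,
// then re-export the symbol under the SlimTensor namespace so existing
// callers keep their spelling (e.g. slim::c10::irange(...)).
#pragma once

// Path assumed; the real headers live under
// executorch/runtime/core/portable_type/c10/c10.
#include <c10/util/irange.h>

namespace executorch {
namespace backends {
namespace aoti {
namespace slim {
namespace c10 {

using ::c10::irange;

} // namespace c10
} // namespace slim
} // namespace aoti
} // namespace backends
} // namespace executorch
```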
@@ -18,6 +38,8 @@ def define_common_targets(): exported_deps = [ "//executorch/runtime/core:core", "//executorch/runtime/platform:platform", + # Reuse c10 utility implementations from portable_type + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) diff --git a/backends/aoti/slim/c10/util/TypeSafeSignMath.h b/backends/aoti/slim/c10/util/TypeSafeSignMath.h index 7e23f64a39e..c70e27c0fd9 100644 --- a/backends/aoti/slim/c10/util/TypeSafeSignMath.h +++ b/backends/aoti/slim/c10/util/TypeSafeSignMath.h @@ -1,144 +1,26 @@ #pragma once -#include - -#include -#include - -STANDALONE_CLANG_DIAGNOSTIC_PUSH() -#if STANDALONE_CLANG_HAS_WARNING("-Wstring-conversion") -STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion") -#endif -#if STANDALONE_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") -STANDALONE_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") -#endif - -namespace executorch::backends::aoti::slim::c10 { - -/// Returns false since we cannot have x < 0 if x is unsigned. -template -inline constexpr bool is_negative( - const T& /*x*/, - std::true_type /*is_unsigned*/) { - return false; -} - -/// Returns true if a signed variable x < 0 -template -inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) { - return x < T(0); -} - -/// Returns true if x < 0 -/// NOTE: Will fail on an unsigned custom type -/// For the most part it's possible to fix this if -/// the custom type has a constexpr constructor. -/// However, notably, executorch::backends::aoti::slim::c10::Half does not -/// :-( -template -inline constexpr bool is_negative(const T& x) { - return is_negative(x, std::is_unsigned()); -} - -/// Returns the sign of an unsigned variable x as 0, 1 -template -inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) { - return T(0) < x; -} - -/// Returns the sign of a signed variable x as -1, 0, 1 -template -inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) { - return (T(0) < x) - (x < T(0)); -} - -/// Returns the sign of x as -1, 0, 1 -/// NOTE: Will fail on an unsigned custom type -/// For the most part it's possible to fix this if -/// the custom type has a constexpr constructor. -/// However, notably, executorch::backends::aoti::slim::c10::Half does not -/// :-( -template -inline constexpr int signum(const T& x) { - return signum(x, std::is_unsigned()); -} - -/// Returns true if a and b are not both negative -template -inline constexpr bool signs_differ(const T& a, const U& b) { - return is_negative(a) != is_negative(b); -} - -// Suppress sign compare warning when compiling with GCC -// as later does not account for short-circuit rule before -// raising the warning, see https://godbolt.org/z/Tr3Msnz99 -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#endif - -/// Returns true if x is greater than the greatest value of the type Limit -template -inline constexpr bool greater_than_max(const T& x) { - constexpr bool can_overflow = - std::numeric_limits::digits > std::numeric_limits::digits; - return can_overflow && x > std::numeric_limits::max(); -} - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -/// Returns true if x < lowest(Limit). 
Standard comparison -template -inline constexpr bool less_than_lowest( - const T& x, - std::false_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < std::numeric_limits::lowest(); -} - -/// Returns false since all the limit is signed and therefore includes -/// negative values but x cannot be negative because it is unsigned -template -inline constexpr bool less_than_lowest( - const T& /*x*/, - std::false_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/// Returns true if x < 0, where 0 is constructed from T. -/// Limit is not signed, so its lower value is zero -template -inline constexpr bool less_than_lowest( - const T& x, - std::true_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < T(0); -} - -/// Returns false sign both types are unsigned -template -inline constexpr bool less_than_lowest( - const T& /*x*/, - std::true_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/// Returns true if x is less than the lowest value of type T -/// NOTE: Will fail on an unsigned custom type -/// For the most part it's possible to fix this if -/// the custom type has a constexpr constructor. -/// However, notably, executorch::backends::aoti::slim::c10::Half does not -/// : -template -inline constexpr bool less_than_lowest(const T& x) { - return less_than_lowest( - x, std::is_unsigned(), std::is_unsigned()); -} - -} // namespace executorch::backends::aoti::slim::c10 - -STANDALONE_CLANG_DIAGNOSTIC_POP() +// Thin wrapper to reuse ExecuTorch's c10 TypeSafeSignMath implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::{is_negative, signum, signs_differ, +// greater_than_max, less_than_lowest}. + +#include + +namespace executorch { +namespace backends { +namespace aoti { +namespace slim { +namespace c10 { + +using ::c10::greater_than_max; +using ::c10::is_negative; +using ::c10::less_than_lowest; +using ::c10::signs_differ; +using ::c10::signum; + +} // namespace c10 +} // namespace slim +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/slim/c10/util/bit_cast.h b/backends/aoti/slim/c10/util/bit_cast.h index 5a1e1208acf..c40729c7c73 100644 --- a/backends/aoti/slim/c10/util/bit_cast.h +++ b/backends/aoti/slim/c10/util/bit_cast.h @@ -1,44 +1,21 @@ #pragma once -#include -#include +// Thin wrapper to reuse ExecuTorch's c10::bit_cast implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::bit_cast. -#if __has_include() && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L) -#include -#define STANDALONE_HAVE_STD_BIT_CAST 1 -#else -#define STANDALONE_HAVE_STD_BIT_CAST 0 -#endif // __has_include() && (__cplusplus >= 202002L || - // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)) +#include -namespace executorch::backends::aoti::slim::c10 { +namespace executorch { +namespace backends { +namespace aoti { +namespace slim { +namespace c10 { -#if STANDALONE_HAVE_STD_BIT_CAST -using std::bit_cast; -#else -// Implementations of std::bit_cast() from C++ 20. -// -// This is a less sketchy version of reinterpret_cast. -// -// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more -// information as well as the source of our implementations. 
-template -std::enable_if_t< - sizeof(To) == sizeof(From) && std::is_trivially_copyable_v && - std::is_trivially_copyable_v, - To> -// constexpr support needs compiler magic -bit_cast(const From& src) noexcept { - static_assert( - std::is_trivially_constructible_v, - "This implementation additionally requires " - "destination type to be trivially constructible"); +using ::c10::bit_cast; - To dst; - std::memcpy(&dst, &src, sizeof(To)); - return dst; -} -#endif // STANDALONE_HAVE_STD_BIT_CAST -#undef STANDALONE_HAVE_STD_BIT_CAST - -} // namespace executorch::backends::aoti::slim::c10 +} // namespace c10 +} // namespace slim +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/slim/c10/util/floating_point_utils.h b/backends/aoti/slim/c10/util/floating_point_utils.h index dbe208b05b9..bb146b21ae5 100644 --- a/backends/aoti/slim/c10/util/floating_point_utils.h +++ b/backends/aoti/slim/c10/util/floating_point_utils.h @@ -1,33 +1,25 @@ #pragma once -#include -#include -#include +// Thin wrapper to reuse ExecuTorch's c10 floating_point_utils implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::detail::{fp32_from_bits, +// fp32_to_bits}. -namespace executorch::backends::aoti::slim::c10::detail { +#include -STANDALONE_HOST_DEVICE inline float fp32_from_bits(uint32_t w) { -#if defined(__OPENCL_VERSION__) - return as_float(w); -#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __uint_as_float((unsigned int)w); -#elif defined(__INTEL_COMPILER) - return _castu32_f32(w); -#else - return executorch::backends::aoti::slim::c10::bit_cast(w); -#endif -} +namespace executorch { +namespace backends { +namespace aoti { +namespace slim { +namespace c10 { +namespace detail { -STANDALONE_HOST_DEVICE inline uint32_t fp32_to_bits(float f) { -#if defined(__OPENCL_VERSION__) - return as_uint(f); -#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return (uint32_t)__float_as_uint(f); -#elif defined(__INTEL_COMPILER) - return _castf32_u32(f); -#else - return executorch::backends::aoti::slim::c10::bit_cast(f); -#endif -} +using ::c10::detail::fp32_from_bits; +using ::c10::detail::fp32_to_bits; -} // namespace executorch::backends::aoti::slim::c10::detail +} // namespace detail +} // namespace c10 +} // namespace slim +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/slim/c10/util/irange.h b/backends/aoti/slim/c10/util/irange.h index 75c8b48d1ca..e199cd92012 100644 --- a/backends/aoti/slim/c10/util/irange.h +++ b/backends/aoti/slim/c10/util/irange.h @@ -1,123 +1,21 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #pragma once -#include - -#include -#include -#include -#include - -namespace executorch::backends::aoti::slim::c10 { - -namespace detail { - -template < - typename I, - bool one_sided = false, - std::enable_if_t, int> = 0> -struct integer_iterator { - using iterator_category = std::input_iterator_tag; - using value_type = I; - using difference_type = std::ptrdiff_t; - using pointer = I*; - using reference = I&; - - explicit constexpr integer_iterator(I value) : value(value) {} - - constexpr I operator*() const { - return value; - } - - constexpr I const* operator->() const { - return &value; - } - - constexpr integer_iterator& operator++() { - ++value; - return *this; - } - - constexpr integer_iterator operator++(int) { - const auto copy = *this; - ++*this; - return copy; - } - - constexpr bool operator==(const integer_iterator& other) const { - if constexpr (one_sided) { - // Range-for loops' end test is `begin != end`, not `begin < - // end`. To handle `executorch::backends::aoti::slim::c10::irange(n)` - // where n < 0 (which should be empty), we just make `begin != end` fail - // whenever `end` is negative. - return is_negative(other.value) || value == other.value; - } else { - return value == other.value; - } - // Suppress "warning: missing return statement at end of non-void function" - // which Nvidia's Robert Crovella confirms is an NVCC compiler error - // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27 - // `__builtin_unreachable();` would be best here, but it's not - // available with all compilers. So we instead return an arbitrary - // value trusting that this line will, in fact, never be reached. - return false; // Horrible hack - } - - constexpr bool operator!=(const integer_iterator& other) const { - return !(*this == other); - } - - protected: - I value; -}; - -} // namespace detail - -template < - typename I, - bool one_sided = false, - std::enable_if_t, bool> = true> -struct integer_range { - public: - constexpr integer_range(I begin, I end) : begin_(begin), end_(end) {} - using iterator = detail::integer_iterator; - constexpr iterator begin() const { - return begin_; - } - constexpr iterator end() const { - return end_; - } +// Thin wrapper to reuse ExecuTorch's c10::irange implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::irange. - private: - iterator begin_; - iterator end_; -}; +#include -/// Creates an integer range for the half-open interval [begin, end) -/// If end<=begin, then the range is empty. -/// The range has the type of the `end` integer; `begin` integer is -/// cast to this type. 
-template < - typename Integer1, - typename Integer2, - std::enable_if_t, bool> = true, - std::enable_if_t, bool> = true> -constexpr integer_range irange(Integer1 begin, Integer2 end) { - // If end<=begin then the range is empty; we can achieve this effect by - // choosing the larger of {begin, end} as the loop terminator - return { - static_cast(begin), - std::max(static_cast(begin), end)}; -} +namespace executorch { +namespace backends { +namespace aoti { +namespace slim { +namespace c10 { -/// Creates an integer range for the half-open interval [0, end) -/// If end<=begin, then the range is empty -template < - typename Integer, - std::enable_if_t, bool> = true> -constexpr integer_range irange(Integer end) { - return {Integer(), end}; -} +using ::c10::irange; -} // namespace executorch::backends::aoti::slim::c10 +} // namespace c10 +} // namespace slim +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/slim/c10/util/llvmMathExtras.h b/backends/aoti/slim/c10/util/llvmMathExtras.h index a42423d009d..537983dd4f9 100644 --- a/backends/aoti/slim/c10/util/llvmMathExtras.h +++ b/backends/aoti/slim/c10/util/llvmMathExtras.h @@ -1,899 +1,25 @@ -//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains some functions that are useful for math stuff. -// -//===----------------------------------------------------------------------===// - #pragma once -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __ANDROID_NDK__ -#include -#endif - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -#ifndef LLVM_GNUC_PREREQ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) -#define LLVM_GNUC_PREREQ(maj, min, patch) \ - ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \ - ((maj) << 20) + ((min) << 10) + (patch)) -#elif defined(__GNUC__) && defined(__GNUC_MINOR__) -#define LLVM_GNUC_PREREQ(maj, min, patch) \ - ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10)) -#else -#define LLVM_GNUC_PREREQ(maj, min, patch) 0 -#endif -#endif - -#ifdef _MSC_VER -// Declare these intrinsics manually rather including intrin.h. It's very -// expensive, and MathExtras.h is popular. -// #include -extern "C" { -unsigned char _BitScanForward(unsigned long* _Index, unsigned long _Mask); -unsigned char _BitScanForward64(unsigned long* _Index, unsigned __int64 _Mask); -unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask); -unsigned char _BitScanReverse64(unsigned long* _Index, unsigned __int64 _Mask); -} -#endif - -namespace executorch::backends::aoti::slim::c10::llvm { -/// The behavior an operation has on an input of 0. -enum ZeroBehavior { - /// The returned value is undefined. - ZB_Undefined, - /// The returned value is numeric_limits::max() - ZB_Max, - /// The returned value is numeric_limits::digits - ZB_Width -}; - -namespace detail { -template -struct TrailingZerosCounter { - static std::size_t count(T Val, ZeroBehavior) { - if (!Val) - return std::numeric_limits::digits; - if (Val & 0x1) - return 0; - - // Bisection method. 
- std::size_t ZeroBits = 0; - T Shift = std::numeric_limits::digits >> 1; - T Mask = std::numeric_limits::max() >> Shift; - while (Shift) { - if ((Val & Mask) == 0) { - Val >>= Shift; - ZeroBits |= Shift; - } - Shift >>= 1; - Mask >>= Shift; - } - return ZeroBits; - } -}; - -#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER) -template -struct TrailingZerosCounter { - static std::size_t count(T Val, ZeroBehavior ZB) { - if (ZB != ZB_Undefined && Val == 0) - return 32; - -#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0) - return __builtin_ctz(Val); -#elif defined(_MSC_VER) - unsigned long Index; - _BitScanForward(&Index, Val); - return Index; -#endif - } -}; - -#if !defined(_MSC_VER) || defined(_M_X64) -template -struct TrailingZerosCounter { - static std::size_t count(T Val, ZeroBehavior ZB) { - if (ZB != ZB_Undefined && Val == 0) - return 64; - -#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0) - return __builtin_ctzll(Val); -#elif defined(_MSC_VER) - unsigned long Index; - _BitScanForward64(&Index, Val); - return Index; -#endif - } -}; -#endif -#endif -} // namespace detail - -/// Count number of 0's from the least significant bit to the most -/// stopping at the first 1. -/// -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are -/// valid arguments. -template -std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { - static_assert( - std::numeric_limits::is_integer && !std::numeric_limits::is_signed, - "Only unsigned integral types are allowed."); - return llvm::detail::TrailingZerosCounter::count(Val, ZB); -} - -namespace detail { -template -struct LeadingZerosCounter { - static std::size_t count(T Val, ZeroBehavior) { - if (!Val) - return std::numeric_limits::digits; - - // Bisection method. - std::size_t ZeroBits = 0; - for (T Shift = std::numeric_limits::digits >> 1; Shift; Shift >>= 1) { - T Tmp = Val >> Shift; - if (Tmp) - Val = Tmp; - else - ZeroBits |= Shift; - } - return ZeroBits; - } -}; - -#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(_MSC_VER) -template -struct LeadingZerosCounter { - static std::size_t count(T Val, ZeroBehavior ZB) { - if (ZB != ZB_Undefined && Val == 0) - return 32; - -#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0) - return __builtin_clz(Val); -#elif defined(_MSC_VER) - unsigned long Index; - _BitScanReverse(&Index, Val); - return Index ^ 31; -#endif - } -}; - -#if !defined(_MSC_VER) || defined(_M_X64) -template -struct LeadingZerosCounter { - static std::size_t count(T Val, ZeroBehavior ZB) { - if (ZB != ZB_Undefined && Val == 0) - return 64; - -#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0) - return __builtin_clzll(Val); -#elif defined(_MSC_VER) - unsigned long Index; - _BitScanReverse64(&Index, Val); - return Index ^ 63; -#endif - } -}; -#endif -#endif -} // namespace detail - -/// Count number of 0's from the most significant bit to the least -/// stopping at the first 1. -/// -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are -/// valid arguments. -template -std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { - static_assert( - std::numeric_limits::is_integer && !std::numeric_limits::is_signed, - "Only unsigned integral types are allowed."); - return llvm::detail::LeadingZerosCounter::count(Val, ZB); -} - -/// Get the index of the first set bit starting from the least -/// significant bit. 
-/// -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are -/// valid arguments. -template -T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { - if (ZB == ZB_Max && Val == 0) - return std::numeric_limits::max(); - - return countTrailingZeros(Val, ZB_Undefined); -} - -/// Create a bitmask with the N right-most bits set to 1, and all other -/// bits set to 0. Only unsigned types are allowed. -template -T maskTrailingOnes(unsigned N) { - static_assert(std::is_unsigned_v, "Invalid type!"); - const unsigned Bits = CHAR_BIT * sizeof(T); - assert(N <= Bits && "Invalid bit index"); - return N == 0 ? 0 : (T(-1) >> (Bits - N)); -} - -/// Create a bitmask with the N left-most bits set to 1, and all other -/// bits set to 0. Only unsigned types are allowed. -template -T maskLeadingOnes(unsigned N) { - return ~maskTrailingOnes(CHAR_BIT * sizeof(T) - N); -} - -/// Create a bitmask with the N right-most bits set to 0, and all other -/// bits set to 1. Only unsigned types are allowed. -template -T maskTrailingZeros(unsigned N) { - return maskLeadingOnes(CHAR_BIT * sizeof(T) - N); -} - -/// Create a bitmask with the N left-most bits set to 0, and all other -/// bits set to 1. Only unsigned types are allowed. -template -T maskLeadingZeros(unsigned N) { - return maskTrailingOnes(CHAR_BIT * sizeof(T) - N); -} - -/// Get the index of the last set bit starting from the least -/// significant bit. -/// -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are -/// valid arguments. -template -T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { - if (ZB == ZB_Max && Val == 0) - return std::numeric_limits::max(); - - // Use ^ instead of - because both gcc and llvm can remove the associated ^ - // in the __builtin_clz intrinsic on x86. - return countLeadingZeros(Val, ZB_Undefined) ^ - (std::numeric_limits::digits - 1); -} - -/// Macro compressed bit reversal table for 256 bits. -/// -/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable -/// NOLINTNEXTLINE(*c-arrays*) -static constexpr unsigned char BitReverseTable256[256] = { -#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 -#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) -#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) - R6(0), - R6(2), - R6(1), - R6(3) -#undef R2 -#undef R4 -#undef R6 -}; - -/// Reverse the bits in \p Val. -template -T reverseBits(T Val) { - // NOLINTNEXTLINE(*c-arrays*) - unsigned char in[sizeof(Val)]; - // NOLINTNEXTLINE(*c-arrays*) - unsigned char out[sizeof(Val)]; - std::memcpy(in, &Val, sizeof(Val)); - for (unsigned i = 0; i < sizeof(Val); ++i) - out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; - std::memcpy(&Val, out, sizeof(Val)); - return Val; -} - -// NOTE: The following support functions use the _32/_64 extensions instead of -// type overloading so that signed and unsigned integers can be used without -// ambiguity. - -/// Return the high 32 bits of a 64 bit value. -constexpr inline uint32_t Hi_32(uint64_t Value) { - return static_cast(Value >> 32); -} - -/// Return the low 32 bits of a 64 bit value. -constexpr inline uint32_t Lo_32(uint64_t Value) { - return static_cast(Value); -} - -/// Make a 64-bit integer from a high / low pair of 32-bit integers. -constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { - return ((uint64_t)High << 32) | (uint64_t)Low; -} - -/// Checks if an integer fits into the given bit width. 
-template -constexpr inline bool isInt(int64_t x) { - return N >= 64 || - (-(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1))); -} -// Template specializations to get better code for common cases. -template <> -constexpr inline bool isInt<8>(int64_t x) { - return static_cast(x) == x; -} -template <> -constexpr inline bool isInt<16>(int64_t x) { - return static_cast(x) == x; -} -template <> -constexpr inline bool isInt<32>(int64_t x) { - return static_cast(x) == x; -} - -/// Checks if a signed integer is an N bit number shifted left by S. -template -constexpr inline bool isShiftedInt(int64_t x) { - static_assert( - N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); - static_assert(N + S <= 64, "isShiftedInt with N + S > 64 is too wide."); - return isInt(x) && (x % (UINT64_C(1) << S) == 0); -} - -/// Checks if an unsigned integer fits into the given bit width. -/// -/// This is written as two functions rather than as simply -/// -/// return N >= 64 || X < (UINT64_C(1) << N); -/// -/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting -/// left too many places. -template -constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { - static_assert(N > 0, "isUInt<0> doesn't make sense"); - return X < (UINT64_C(1) << (N)); -} -template -constexpr inline std::enable_if_t= 64, bool> isUInt(uint64_t /*X*/) { - return true; -} - -// Template specializations to get better code for common cases. -template <> -constexpr inline bool isUInt<8>(uint64_t x) { - return static_cast(x) == x; -} -template <> -constexpr inline bool isUInt<16>(uint64_t x) { - return static_cast(x) == x; -} -template <> -constexpr inline bool isUInt<32>(uint64_t x) { - return static_cast(x) == x; -} - -/// Checks if a unsigned integer is an N bit number shifted left by S. -template -constexpr inline bool isShiftedUInt(uint64_t x) { - static_assert( - N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); - static_assert( - N + S <= 64, "isShiftedUInt with N + S > 64 is too wide."); - // Per the two static_asserts above, S must be strictly less than 64. So - // 1 << S is not undefined behavior. - return isUInt(x) && (x % (UINT64_C(1) << S) == 0); -} - -/// Gets the maximum value for a N-bit unsigned integer. -inline uint64_t maxUIntN(uint64_t N) { - assert(N > 0 && N <= 64 && "integer width out of range"); - - // uint64_t(1) << 64 is undefined behavior, so we can't do - // (uint64_t(1) << N) - 1 - // without checking first that N != 64. But this works and doesn't have a - // branch. - return UINT64_MAX >> (64 - N); -} - -// Ignore the false warning "Arithmetic overflow" for MSVC -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - -/// Gets the minimum value for a N-bit signed integer. -inline int64_t minIntN(int64_t N) { - assert(N > 0 && N <= 64 && "integer width out of range"); - // NOLINTNEXTLINE(*-narrowing-conversions) - return -(UINT64_C(1) << (N - 1)); -} - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/// Gets the maximum value for a N-bit signed integer. -inline int64_t maxIntN(int64_t N) { - assert(N > 0 && N <= 64 && "integer width out of range"); - - // This relies on two's complement wraparound when N == 64, so we convert to - // int64_t only at the very end to avoid UB. - // NOLINTNEXTLINE(*-narrowing-conversions) - return (UINT64_C(1) << (N - 1)) - 1; -} - -/// Checks if an unsigned integer fits into the given (dynamic) bit width. 
-inline bool isUIntN(unsigned N, uint64_t x) { - return N >= 64 || x <= maxUIntN(N); -} - -/// Checks if an signed integer fits into the given (dynamic) bit width. -inline bool isIntN(unsigned N, int64_t x) { - return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); -} - -/// Return true if the argument is a non-empty sequence of ones starting at the -/// least significant bit with the remainder zero (32 bit version). -/// Ex. isMask_32(0x0000FFFFU) == true. -constexpr inline bool isMask_32(uint32_t Value) { - return Value && ((Value + 1) & Value) == 0; -} - -/// Return true if the argument is a non-empty sequence of ones starting at the -/// least significant bit with the remainder zero (64 bit version). -constexpr inline bool isMask_64(uint64_t Value) { - return Value && ((Value + 1) & Value) == 0; -} - -/// Return true if the argument contains a non-empty sequence of ones with the -/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. -constexpr inline bool isShiftedMask_32(uint32_t Value) { - return Value && isMask_32((Value - 1) | Value); -} - -/// Return true if the argument contains a non-empty sequence of ones with the -/// remainder zero (64 bit version.) -constexpr inline bool isShiftedMask_64(uint64_t Value) { - return Value && isMask_64((Value - 1) | Value); -} - -/// Return true if the argument is a power of two > 0. -/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) -constexpr inline bool isPowerOf2_32(uint32_t Value) { - return Value && !(Value & (Value - 1)); -} - -/// Return true if the argument is a power of two > 0 (64 bit edition.) -constexpr inline bool isPowerOf2_64(uint64_t Value) { - return Value && !(Value & (Value - 1)); -} - -/// Count the number of ones from the most significant bit to the first -/// zero bit. -/// -/// Ex. countLeadingOnes(0xFF0FFF00) == 8. -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of all ones. Only ZB_Width and -/// ZB_Undefined are valid arguments. -template -std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { - static_assert( - std::numeric_limits::is_integer && !std::numeric_limits::is_signed, - "Only unsigned integral types are allowed."); - return countLeadingZeros(~Value, ZB); -} - -/// Count the number of ones from the least significant bit to the first -/// zero bit. -/// -/// Ex. countTrailingOnes(0x00FF00FF) == 8. -/// Only unsigned integral types are allowed. -/// -/// \param ZB the behavior on an input of all ones. Only ZB_Width and -/// ZB_Undefined are valid arguments. -template -std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { - static_assert( - std::numeric_limits::is_integer && !std::numeric_limits::is_signed, - "Only unsigned integral types are allowed."); - return countTrailingZeros(~Value, ZB); -} - -namespace detail { -template -struct PopulationCounter { - static unsigned count(T Value) { - // Generic version, forward to 32 bits. 
- static_assert(SizeOfT <= 4, "Not implemented!"); -#if defined(__GNUC__) && __GNUC__ >= 4 - return __builtin_popcount(Value); -#else - uint32_t v = Value; - v = v - ((v >> 1) & 0x55555555); - v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; -#endif - } -}; - -template -struct PopulationCounter { - static unsigned count(T Value) { -#if defined(__GNUC__) && __GNUC__ >= 4 - return __builtin_popcountll(Value); -#else - uint64_t v = Value; - v = v - ((v >> 1) & 0x5555555555555555ULL); - v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); - v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); -#endif - } -}; -} // namespace detail - -/// Count the number of set bits in a value. -/// Ex. countPopulation(0xF000F000) = 8 -/// Returns 0 if the word is zero. -template -inline unsigned countPopulation(T Value) { - static_assert( - std::numeric_limits::is_integer && !std::numeric_limits::is_signed, - "Only unsigned integral types are allowed."); - return detail::PopulationCounter::count(Value); -} - -/// Return the log base 2 of the specified value. -inline double Log2(double Value) { -#if defined(__ANDROID_API__) && __ANDROID_API__ < 18 - return __builtin_log(Value) / __builtin_log(2.0); -#else - return log2(Value); -#endif -} - -/// Return the floor log base 2 of the specified value, -1 if the value is zero. -/// (32 bit edition.) -/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 -inline unsigned Log2_32(uint32_t Value) { - return static_cast(31 - countLeadingZeros(Value)); -} - -/// Return the floor log base 2 of the specified value, -1 if the value is zero. -/// (64 bit edition.) -inline unsigned Log2_64(uint64_t Value) { - return static_cast(63 - countLeadingZeros(Value)); -} - -/// Return the ceil log base 2 of the specified value, 32 if the value is zero. -/// (32 bit edition). -/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 -inline unsigned Log2_32_Ceil(uint32_t Value) { - return static_cast(32 - countLeadingZeros(Value - 1)); -} - -/// Return the ceil log base 2 of the specified value, 64 if the value is zero. -/// (64 bit edition.) -inline unsigned Log2_64_Ceil(uint64_t Value) { - return static_cast(64 - countLeadingZeros(Value - 1)); -} - -/// Return the greatest common divisor of the values using Euclid's algorithm. -inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { - while (B) { - uint64_t T = B; - B = A % B; - A = T; - } - return A; -} - -/// This function takes a 64-bit integer and returns the bit equivalent double. -inline double BitsToDouble(uint64_t Bits) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - double D; - static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); - memcpy(&D, &Bits, sizeof(Bits)); - return D; -} - -/// This function takes a 32-bit integer and returns the bit equivalent float. -inline float BitsToFloat(uint32_t Bits) { - // TODO: Use std::bit_cast once C++20 becomes available. - return executorch::backends::aoti::slim::c10::bit_cast(Bits); -} - -/// This function takes a double and returns the bit equivalent 64-bit integer. -/// Note that copying doubles around changes the bits of NaNs on some hosts, -/// notably x86, so this routine cannot be used if these bits are needed. 
-inline uint64_t DoubleToBits(double Double) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - uint64_t Bits; - static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); - memcpy(&Bits, &Double, sizeof(Double)); - return Bits; -} - -/// This function takes a float and returns the bit equivalent 32-bit integer. -/// Note that copying floats around changes the bits of NaNs on some hosts, -/// notably x86, so this routine cannot be used if these bits are needed. -inline uint32_t FloatToBits(float Float) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - uint32_t Bits; - static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); - memcpy(&Bits, &Float, sizeof(Float)); - return Bits; -} - -/// A and B are either alignments or offsets. Return the minimum alignment that -/// may be assumed after adding the two together. -constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { - // The largest power of 2 that divides both A and B. - // - // Replace "-Value" by "1+~Value" in the following commented code to avoid - // MSVC warning C4146 - // return (A | B) & -(A | B); - return (A | B) & (1 + ~(A | B)); -} - -/// Aligns \c Addr to \c Alignment bytes, rounding up. -/// -/// Alignment should be a power of two. This method rounds up, so -/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8. -inline uintptr_t alignAddr(const void* Addr, size_t Alignment) { - assert( - Alignment && isPowerOf2_64((uint64_t)Alignment) && - "Alignment is not a power of two!"); - - assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr); - - return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1)); -} - -/// Returns the necessary adjustment for aligning \c Ptr to \c Alignment -/// bytes, rounding up. -inline size_t alignmentAdjustment(const void* Ptr, size_t Alignment) { - return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr; -} - -/// Returns the next power of two (in 64-bits) that is strictly greater than A. -/// Returns zero on overflow. -inline uint64_t NextPowerOf2(uint64_t A) { - A |= (A >> 1); - A |= (A >> 2); - A |= (A >> 4); - A |= (A >> 8); - A |= (A >> 16); - A |= (A >> 32); - return A + 1; -} - -/// Returns the power of two which is less than or equal to the given value. -/// Essentially, it is a floor operation across the domain of powers of two. -inline uint64_t PowerOf2Floor(uint64_t A) { - if (!A) - return 0; - return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); -} - -/// Returns the power of two which is greater than or equal to the given value. -/// Essentially, it is a ceil operation across the domain of powers of two. -inline uint64_t PowerOf2Ceil(uint64_t A) { - if (!A) - return 0; - return NextPowerOf2(A - 1); -} - -/// Returns the next integer (mod 2**64) that is greater than or equal to -/// \p Value and is a multiple of \p Align. \p Align must be non-zero. -/// -/// If non-zero \p Skew is specified, the return value will be a minimal -/// integer that is greater than or equal to \p Value and equal to -/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than -/// \p Align, its value is adjusted to '\p Skew mod \p Align'. 
-/// -/// Examples: -/// \code -/// alignTo(5, 8) = 8 -/// alignTo(17, 8) = 24 -/// alignTo(~0LL, 8) = 0 -/// alignTo(321, 255) = 510 -/// -/// alignTo(5, 8, 7) = 7 -/// alignTo(17, 8, 1) = 17 -/// alignTo(~0LL, 8, 3) = 3 -/// alignTo(321, 255, 42) = 552 -/// \endcode -inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { - assert(Align != 0u && "Align can't be 0."); - Skew %= Align; - return (Value + Align - 1 - Skew) / Align * Align + Skew; -} - -/// Returns the next integer (mod 2**64) that is greater than or equal to -/// \p Value and is a multiple of \c Align. \c Align must be non-zero. -template -constexpr inline uint64_t alignTo(uint64_t Value) { - static_assert(Align != 0u, "Align must be non-zero"); - return (Value + Align - 1) / Align * Align; -} - -/// Returns the integer ceil(Numerator / Denominator). -inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { - return alignTo(Numerator, Denominator) / Denominator; -} - -/// \c alignTo for contexts where a constant expression is required. -/// \sa alignTo -/// -/// \todo FIXME: remove when \c constexpr becomes really \c constexpr -template -struct AlignTo { - static_assert(Align != 0u, "Align must be non-zero"); - template - struct from_value { - static const uint64_t value = (Value + Align - 1) / Align * Align; - }; -}; - -/// Returns the largest uint64_t less than or equal to \p Value and is -/// \p Skew mod \p Align. \p Align must be non-zero -inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { - assert(Align != 0u && "Align can't be 0."); - Skew %= Align; - return (Value - Skew) / Align * Align + Skew; -} - -/// Returns the offset to the next integer (mod 2**64) that is greater than -/// or equal to \p Value and is a multiple of \p Align. \p Align must be -/// non-zero. -inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) { - return alignTo(Value, Align) - Value; -} - -/// Sign-extend the number in the bottom B bits of X to a 32-bit integer. -/// Requires 0 < B <= 32. -template -constexpr inline int32_t SignExtend32(uint32_t X) { - static_assert(B > 0, "Bit width can't be 0."); - static_assert(B <= 32, "Bit width out of range."); - return int32_t(X << (32 - B)) >> (32 - B); -} - -/// Sign-extend the number in the bottom B bits of X to a 32-bit integer. -/// Requires 0 < B < 32. -inline int32_t SignExtend32(uint32_t X, unsigned B) { - assert(B > 0 && "Bit width can't be 0."); - assert(B <= 32 && "Bit width out of range."); - return int32_t(X << (32 - B)) >> (32 - B); -} - -/// Sign-extend the number in the bottom B bits of X to a 64-bit integer. -/// Requires 0 < B < 64. -template -constexpr inline int64_t SignExtend64(uint64_t x) { - static_assert(B > 0, "Bit width can't be 0."); - static_assert(B <= 64, "Bit width out of range."); - return int64_t(x << (64 - B)) >> (64 - B); -} - -/// Sign-extend the number in the bottom B bits of X to a 64-bit integer. -/// Requires 0 < B < 64. -inline int64_t SignExtend64(uint64_t X, unsigned B) { - assert(B > 0 && "Bit width can't be 0."); - assert(B <= 64 && "Bit width out of range."); - return int64_t(X << (64 - B)) >> (64 - B); -} - -/// Subtract two unsigned integers, X and Y, of type T and return the absolute -/// value of the result. -template -std::enable_if_t, T> AbsoluteDifference(T X, T Y) { - return std::max(X, Y) - std::min(X, Y); -} - -/// Add two unsigned integers, X and Y, of type T. Clamp the result to the -/// maximum representable value of T on overflow. 
ResultOverflowed indicates if -/// the result is larger than the maximum representable value of type T. -template -std::enable_if_t, T> -SaturatingAdd(T X, T Y, bool* ResultOverflowed = nullptr) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool Dummy; - bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; - // Hacker's Delight, p. 29 - T Z = X + Y; - Overflowed = (Z < X || Z < Y); - if (Overflowed) - return std::numeric_limits::max(); - else - return Z; -} - -/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the -/// maximum representable value of T on overflow. ResultOverflowed indicates if -/// the result is larger than the maximum representable value of type T. -template -std::enable_if_t, T> -SaturatingMultiply(T X, T Y, bool* ResultOverflowed = nullptr) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool Dummy; - bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; - - // Hacker's Delight, p. 30 has a different algorithm, but we don't use that - // because it fails for uint16_t (where multiplication can have undefined - // behavior due to promotion to int), and requires a division in addition - // to the multiplication. - - Overflowed = false; - - // Log2(Z) would be either Log2Z or Log2Z + 1. - // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z - // will necessarily be less than Log2Max as desired. - int Log2Z = Log2_64(X) + Log2_64(Y); - const T Max = std::numeric_limits::max(); - int Log2Max = Log2_64(Max); - if (Log2Z < Log2Max) { - return X * Y; - } - if (Log2Z > Log2Max) { - Overflowed = true; - return Max; - } - - // We're going to use the top bit, and maybe overflow one - // bit past it. Multiply all but the bottom bit then add - // that on at the end. - T Z = (X >> 1) * Y; - if (Z & ~(Max >> 1)) { - Overflowed = true; - return Max; - } - Z <<= 1; - if (X & 1) - return SaturatingAdd(Z, Y, ResultOverflowed); - - return Z; -} - -/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to -/// the product. Clamp the result to the maximum representable value of T on -/// overflow. ResultOverflowed indicates if the result is larger than the -/// maximum representable value of type T. -template -std::enable_if_t, T> -SaturatingMultiplyAdd(T X, T Y, T A, bool* ResultOverflowed = nullptr) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool Dummy; - bool& Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; - - T Product = SaturatingMultiply(X, Y, &Overflowed); - if (Overflowed) - return Product; - - return SaturatingAdd(A, Product, &Overflowed); -} - -/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. -extern const float huge_valf; -} // namespace executorch::backends::aoti::slim::c10::llvm +// Thin wrapper to reuse ExecuTorch's c10 llvmMathExtras implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::llvm functions. 
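A small usage sketch for the three helpers this wrapper re-exports below (Hi_32, Lo_32, Make_64), assuming the usual executorch/ include prefix:

```cpp
// Hypothetical caller; the executorch/ include prefix is an assumption.
#include <executorch/backends/aoti/slim/c10/util/llvmMathExtras.h>

#include <cassert>
#include <cstdint>

namespace slim_c10 = executorch::backends::aoti::slim::c10;

int main() {
  const uint64_t v = 0x0123456789ABCDEFULL;
  const uint32_t hi = slim_c10::llvm::Hi_32(v); // 0x01234567
  const uint32_t lo = slim_c10::llvm::Lo_32(v); // 0x89ABCDEF
  // Make_64 recombines the halves, round-tripping the original value.
  assert(slim_c10::llvm::Make_64(hi, lo) == v);
  return 0;
}
```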
+ +#include + +namespace executorch { +namespace backends { +namespace aoti { +namespace slim { +namespace c10 { +namespace llvm { + +using ::c10::llvm::Hi_32; +using ::c10::llvm::Lo_32; +using ::c10::llvm::Make_64; + +} // namespace llvm +} // namespace c10 +} // namespace slim +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/slim/c10/util/safe_numerics.h b/backends/aoti/slim/c10/util/safe_numerics.h index df0aa6e7c5c..c5dbd24dd57 100644 --- a/backends/aoti/slim/c10/util/safe_numerics.h +++ b/backends/aoti/slim/c10/util/safe_numerics.h @@ -1,95 +1,27 @@ #pragma once -#include -#include - -// GCC has __builtin_mul_overflow from before it supported __has_builtin -#ifdef _MSC_VER -#define STANDALONE_HAS_BUILTIN_OVERFLOW() (0) -#include -#include -#else -#define STANDALONE_HAS_BUILTIN_OVERFLOW() (1) -#endif - -namespace executorch::backends::aoti::slim::c10 { - -STANDALONE_ALWAYS_INLINE bool -add_overflows(uint64_t a, uint64_t b, uint64_t* out) { -#if STANDALONE_HAS_BUILTIN_OVERFLOW() - return __builtin_add_overflow(a, b, out); -#else - unsigned long long tmp; -#if defined(_M_IX86) || defined(_M_X64) - auto carry = _addcarry_u64(0, a, b, &tmp); -#else - tmp = a + b; - unsigned long long vector = (a & b) ^ ((a ^ b) & ~tmp); - auto carry = vector >> 63; -#endif - *out = tmp; - return carry; -#endif -} - -STANDALONE_ALWAYS_INLINE bool -mul_overflows(uint64_t a, uint64_t b, uint64_t* out) { -#if STANDALONE_HAS_BUILTIN_OVERFLOW() - return __builtin_mul_overflow(a, b, out); -#else - *out = a * b; - // This test isnt exact, but avoids doing integer division - return ( - (executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(a) + - executorch::backends::aoti::slim::c10::llvm::countLeadingZeros(b)) < 64); -#endif -} - -STANDALONE_ALWAYS_INLINE bool -mul_overflows(int64_t a, int64_t b, int64_t* out) { -#if STANDALONE_HAS_BUILTIN_OVERFLOW() - return __builtin_mul_overflow(a, b, out); -#else - volatile int64_t tmp = a * b; - *out = tmp; - if (a == 0 || b == 0) { - return false; - } - return !(a == tmp / b); -#endif -} - -template -bool safe_multiplies_u64(It first, It last, uint64_t* out) { -#if STANDALONE_HAS_BUILTIN_OVERFLOW() - uint64_t prod = 1; - bool overflow = false; - for (; first != last; ++first) { - overflow |= executorch::backends::aoti::slim::c10::mul_overflows( - prod, *first, &prod); - } - *out = prod; - return overflow; -#else - uint64_t prod = 1; - uint64_t prod_log2 = 0; - bool is_zero = false; - for (; first != last; ++first) { - auto x = static_cast(*first); - prod *= x; - // log2(0) isn't valid, so need to track it specially - is_zero |= (x == 0); - prod_log2 += executorch::backends::aoti::slim::c10::llvm::Log2_64_Ceil(x); - } - *out = prod; - // This test isnt exact, but avoids doing integer division - return !is_zero && (prod_log2 >= 64); -#endif -} - -template -bool safe_multiplies_u64(const Container& c, uint64_t* out) { - return safe_multiplies_u64(c.begin(), c.end(), out); -} - -} // namespace executorch::backends::aoti::slim::c10 +// Thin wrapper to reuse ExecuTorch's c10 safe_numerics implementation. +// This provides backward compatibility for SlimTensor code that uses +// executorch::backends::aoti::slim::c10::{safe_multiplies_u64, add_overflows, +// mul_overflows}. +// +// NOTE: multiply_integers is defined in accumulate.h (SlimTensor-specific). +// NOTE: sub_overflows is not available in ET c10 safe_numerics. 
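A minimal sketch of how the re-exported overflow helpers are typically used, for example to validate an allocation size (include prefix assumed, as above):

```cpp
#include <executorch/backends/aoti/slim/c10/util/safe_numerics.h>

#include <cstdint>
#include <cstdio>

namespace slim_c10 = executorch::backends::aoti::slim::c10;

int main() {
  uint64_t bytes = 0;
  // mul_overflows() returns true when the product does not fit in uint64_t;
  // the product itself is written through the out-parameter.
  const bool overflow = slim_c10::mul_overflows(
      uint64_t{1} << 40, uint64_t{1} << 30, &bytes);
  if (overflow) {
    std::printf("allocation size overflows uint64_t\n");
    return 1;
  }
  std::printf("allocation size: %llu bytes\n",
              static_cast<unsigned long long>(bytes));
  return 0;
}
```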
+
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+namespace slim {
+namespace c10 {
+
+using ::c10::add_overflows;
+using ::c10::mul_overflows;
+using ::c10::safe_multiplies_u64;
+
+} // namespace c10
+} // namespace slim
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
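As a closing illustration, the TypeSafeSignMath helpers re-exported earlier in this diff can still back a checked narrowing cast through the slim namespace (a sketch under the same include-prefix assumption; `narrow_dim` is a hypothetical helper):

```cpp
#include <executorch/backends/aoti/slim/c10/util/TypeSafeSignMath.h>

#include <cstdint>
#include <optional>

namespace slim_c10 = executorch::backends::aoti::slim::c10;

// Hypothetical helper: narrow an int64_t dimension to int32_t, refusing
// values that do not fit instead of silently wrapping.
std::optional<int32_t> narrow_dim(int64_t dim) {
  if (slim_c10::greater_than_max<int32_t>(dim) ||
      slim_c10::less_than_lowest<int32_t>(dim)) {
    return std::nullopt;
  }
  return static_cast<int32_t>(dim);
}

int main() {
  // 2^40 does not fit in int32_t, so the helper reports failure.
  return narrow_dim(int64_t{1} << 40).has_value() ? 1 : 0;
}
```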