From 18afdaf862317edfbe16df73477daf4888b55519 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Tue, 23 Dec 2025 17:04:00 +0800 Subject: [PATCH] Apply more tidy fixes Signed-off-by: Yuanyuan Chen --- .../raw_embedding_streamer.h | 20 +- .../fbgemm_gpu/utils/tensor_accessor.h | 2 +- fbgemm_gpu/src/config/feature_gates.cpp | 2 +- .../embedding_inplace_update_cpu.cpp | 12 +- .../src/faster_hash_ops/faster_hash.cpp | 2 +- .../input_combine_ops/input_combine_cpu.cpp | 14 +- .../input_combine_ops/input_combine_gpu.cpp | 6 +- .../jagged_tensor_ops_autograd.cpp | 4 +- .../jagged_tensor_ops_cpu.cpp | 195 +++++++++--------- .../jagged_tensor_ops_meta.cpp | 36 ++-- .../layout_transform_ops_cpu.cpp | 2 +- .../merge_pooled_embedding_ops_cpu.cpp | 6 +- .../permute_pooled_embedding_ops_cpu.cpp | 10 +- .../permute_pooled_embedding_ops_gpu.cpp | 4 +- ...permute_pooled_embedding_ops_split_cpu.cpp | 10 +- .../src/quantize_ops/quantize_ops_cpu.cpp | 18 +- .../src/quantize_ops/quantize_ops_meta.cpp | 6 +- .../sparse_async_batched_cumsum.cpp | 2 +- .../src/sparse_ops/sparse_async_cumsum.cpp | 2 +- .../raw_embedding_streamer.cpp | 14 +- fbgemm_gpu/src/tbe/eeg/eeg_models.h | 14 +- fbgemm_gpu/src/topology_utils.cpp | 2 +- 22 files changed, 190 insertions(+), 193 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/raw_embedding_streamer.h b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/raw_embedding_streamer.h index 804535a5d8..bf4f9c7b6e 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/raw_embedding_streamer.h +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/raw_embedding_streamer.h @@ -22,18 +22,14 @@ struct StreamQueueItem { std::optional identities; std::optional runtime_meta; at::Tensor count; - StreamQueueItem( - at::Tensor src_indices, - at::Tensor src_weights, - std::optional src_identities, - std::optional src_runtime_meta, - at::Tensor src_count) { - indices = std::move(src_indices); - weights = std::move(src_weights); - identities = std::move(src_identities); - runtime_meta = std::move(src_runtime_meta); - count = std::move(src_count); - } + StreamQueueItem(at::Tensor src_indices, at::Tensor src_weights, + std::optional src_identities, + std::optional src_runtime_meta, + at::Tensor src_count) + : indices(std::move(src_indices)), weights(std::move(src_weights)), + identities(std::move(src_identities)), + runtime_meta(std::move(src_runtime_meta)), + count(std::move(src_count)) {} }; class RawEmbeddingStreamer : public torch::jit::CustomClassHolder { diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor.h b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor.h index 4731150945..c3825b0888 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor.h +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor.h @@ -327,7 +327,7 @@ class PackedTensorAccessor } protected: - size_t numel_; + size_t numel_{}; char name_[NAME_MAX_LEN]; char context_[CONTEXT_MAX_LEN]; diff --git a/fbgemm_gpu/src/config/feature_gates.cpp b/fbgemm_gpu/src/config/feature_gates.cpp index cdf61d2032..a43e8cc2e7 100644 --- a/fbgemm_gpu/src/config/feature_gates.cpp +++ b/fbgemm_gpu/src/config/feature_gates.cpp @@ -31,7 +31,7 @@ std::string to_string(const FeatureGateName& value) { return "UNKNOWN"; } -bool ev_check_key(const std::string& key) { +static bool ev_check_key(const std::string& key) { const auto env_var = "FBGEMM_" + key; const auto value = std::getenv(env_var.c_str()); diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp 
b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 5309a78f7b..5421749cad 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -23,7 +23,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { template -void embedding_inplace_update_cpu_kernel( +static void embedding_inplace_update_cpu_kernel( at::TensorAccessor dev_weights, at::TensorAccessor uvm_weights, const at::TensorAccessor& weights_placements, @@ -151,9 +151,9 @@ void dram_kv_embedding_inplace_update_cpu( const uint8_t* weights_tys_ptr = weights_tys.data_ptr(); const int32_t* D_offsets_ptr = D_offsets.data_ptr(); - const uint8_t* update_weights_ptr = update_weights.data_ptr(); + uint8_t* update_weights_ptr = update_weights.mutable_data_ptr(); const int32_t* update_table_idx_ptr = update_table_idx.data_ptr(); - const int64_t* update_row_idx_ptr = update_row_idx.data_ptr(); + int64_t* update_row_idx_ptr = update_row_idx.mutable_data_ptr(); const int64_t* update_offsets_ptr = update_offsets.data_ptr(); int64_t window_start = 0; @@ -172,15 +172,13 @@ void dram_kv_embedding_inplace_update_cpu( int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment); - uint8_t* batched_weights_ptr = const_cast( - update_weights_ptr + update_offsets_ptr[window_start]); + uint8_t* batched_weights_ptr = update_weights_ptr + update_offsets_ptr[window_start]; auto weights_tensor = at::from_blob( batched_weights_ptr, {window_size, D_bytes}, at::TensorOptions().dtype(at::kByte)); - int64_t* row_ids_ptr = - const_cast(update_row_idx_ptr + window_start); + int64_t* row_ids_ptr = update_row_idx_ptr + window_start; auto row_id_tensor = at::from_blob( row_ids_ptr, {window_size}, at::TensorOptions().dtype(at::kLong)); diff --git a/fbgemm_gpu/src/faster_hash_ops/faster_hash.cpp b/fbgemm_gpu/src/faster_hash_ops/faster_hash.cpp index b00b861325..12899f88e4 100644 --- a/fbgemm_gpu/src/faster_hash_ops/faster_hash.cpp +++ b/fbgemm_gpu/src/faster_hash_ops/faster_hash.cpp @@ -263,7 +263,7 @@ std::tuple create_zch_buffer_cpu( return {identity, metadata}; } -void zero_collision_hash_cpu_out( +static void zero_collision_hash_cpu_out( Tensor& output, const Tensor& input, const Tensor& identities, diff --git a/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp b/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp index ef97beba48..a37ba0bffc 100644 --- a/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp +++ b/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp @@ -27,7 +27,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { -void _cat_int_tensors_out( +static void _cat_int_tensors_out( Tensor& combined_tensors, const std::vector& tensor_list, int64_t total_num, @@ -82,7 +82,7 @@ void _cat_int_tensors_out( } } -Tensor _cat_int_tensors( +static Tensor _cat_int_tensors( const std::vector& tensor_list, int64_t total_num, bool use_pin_memory, @@ -107,7 +107,7 @@ Tensor _cat_int_tensors( return combined_tensors; } -Tensor _cat_int_tensors_with_padding( +static Tensor _cat_int_tensors_with_padding( const std::vector& tensor_list, int64_t total_num, bool use_pin_memory, @@ -140,7 +140,7 @@ Tensor _cat_int_tensors_with_padding( return combined_tensors; } -void _cat_per_sample_weights_list_out( +static void _cat_per_sample_weights_list_out( Tensor& out, const std::vector& per_sample_weights, const std::vector& indices_list, @@ -178,7 +178,7 @@ void _cat_per_sample_weights_list_out( } } -Tensor 
_cat_per_sample_weights_list( +static Tensor _cat_per_sample_weights_list( const std::vector& per_sample_weights, const std::vector& indices_list, int64_t total_num, @@ -375,7 +375,7 @@ void tbe_input_combine_with_length_cpu_out( combined_per_sample_weights.resize_({0}); } -std::tuple tbe_input_combine_with_length_cpu( +static std::tuple tbe_input_combine_with_length_cpu( const std::vector& indices_list, const std::vector& lengths_list, const std::vector& per_sample_weights) { @@ -518,7 +518,7 @@ std::tuple padding_fused_tbe_input_combine_cpu( /// @param lengths_list list of lengths. /// @param per_sample_weights list of per_sample_weights /// @return tuple of combined indices, lengths, and per_sample_weights -std::tuple +static std::tuple padding_fused_tbe_input_combine_with_length_cpu( const std::vector& indices_list, const std::vector& lengths_list, diff --git a/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp index d357bf9076..a9c8a65c09 100644 --- a/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp +++ b/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp @@ -31,12 +31,12 @@ enum args_pos { }; template -uint64_t compute_num_uint64s(const uint64_t num_elements) { +static uint64_t compute_num_uint64s(const uint64_t num_elements) { const uint64_t ratio = sizeof(uint64_t) / sizeof(T); return (num_elements + ratio - 1) / ratio; } -void offset_tbe_input_combine_with_length_args( +static void offset_tbe_input_combine_with_length_args( uint64_t** indices_addrs, uint64_t** lengths_addrs, uint64_t** indices_offsets, @@ -59,7 +59,7 @@ void offset_tbe_input_combine_with_length_args( reinterpret_cast(base_addr + ptr_offsets[P_lengths_is_long]); } -std::tuple tbe_input_combine_with_length_gpu( +static std::tuple tbe_input_combine_with_length_gpu( const std::vector& indices_list, const std::vector& lengths_list, const std::vector& per_sample_weights) { diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp index ac14fdd975..44dd2c47be 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp @@ -788,7 +788,7 @@ class JaggedSliceOp : public torch::autograd::Function { } // namespace ///@ingroup jagged-tensor-ops-cpu -Tensor jagged_to_padded_dense_forward_autograd( +static Tensor jagged_to_padded_dense_forward_autograd( const Tensor& values, const std::vector& offsets, const c10::SymIntArrayRef max_lengths, @@ -883,7 +883,7 @@ std::tuple> dense_to_jagged( auto output = op.call(dense, offsets, total_L); return {output, offsets}; } -Tensor dense_to_jagged_forward_autograd( +static Tensor dense_to_jagged_forward_autograd( const Tensor& dense, const std::vector& offsets, std::optional total_L) { diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 4662c24181..c7905308e1 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include "ATen/Parallel.h" #include "common.h" @@ -45,7 +48,7 @@ inline bool walk_down_tensor_storage_tree_except_last_( int j_temp = flattened_jagged_idx; #pragma unroll for (int d = NUM_JAGGED_DIM - 2; d >= 0; --d) { - const int jagged_size = jagged_dims[d + 1]; + const auto jagged_size = jagged_dims[d + 1]; 
jagged_coords[d] = j_temp % jagged_size; j_temp /= jagged_size; } @@ -53,8 +56,8 @@ inline bool walk_down_tensor_storage_tree_except_last_( bool is_zero = false; #pragma unroll for (int d = 0; d < NUM_JAGGED_DIM - 1; ++d) { - const int begin = x_offsets[d][offset]; - const int end = x_offsets[d][offset + 1]; + const auto begin = x_offsets[d][offset]; + const auto end = x_offsets[d][offset + 1]; if (jagged_coords[d] >= end - begin) { is_zero = true; break; @@ -122,7 +125,7 @@ void jagged_dense_elementwise_dense_output_kernel_( " != NUM_JAGGED_DIM, ", NUM_JAGGED_DIM); - const int outer_dense_size = y.size(0); + const auto outer_dense_size = y.size(0); TORCH_CHECK( outer_dense_size == x_offsets[0].numel() - 1, "outer_dense_size, ", @@ -131,7 +134,7 @@ void jagged_dense_elementwise_dense_output_kernel_( x_offsets[0].numel() - 1); TORCH_CHECK( !NO_INNER_DENSE || y.size(-1) == 1, "y.size(-1), ", y.size(-1), " != 1"); - const int inner_dense_size = NO_INNER_DENSE ? 1 : y.size(-1); + const auto inner_dense_size = NO_INNER_DENSE ? 1 : y.size(-1); TORCH_CHECK( inner_dense_size == x_values.size(-1), "inner_dense_size, ", @@ -143,9 +146,9 @@ void jagged_dense_elementwise_dense_output_kernel_( return; } - const int jagged_folded_size = - y.numel() / (outer_dense_size * inner_dense_size); - const int jagged_innermost_size = y.size(-2); + const auto jagged_folded_size = + y.numel() / (static_cast(outer_dense_size * inner_dense_size)); + const auto jagged_innermost_size = y.size(-2); // Canonicalize y and output to 3D, collapsing jagged dimensions. const Tensor y_reshaped = y.view({y.size(0), -1, y.size(-1)}); @@ -173,10 +176,10 @@ void jagged_dense_elementwise_dense_output_kernel_( // jagged dimension. int jiidx = 0; if (!is_zero) { - const int begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; - const int end = + const auto begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; + const auto end = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base + 1]; - for (; jiidx < std::min(end - begin, jagged_innermost_size); ++jiidx) { + for (; jiidx < std::min(end - begin, static_cast(jagged_innermost_size)); ++jiidx) { int jidx = joidx * jagged_innermost_size + jiidx; if (NO_INNER_DENSE) { output_accessor[oidx][jidx][0] = @@ -227,7 +230,7 @@ void jagged_dense_elementwise_dense_output_( index_t>(x_values, x_offsets, y, output, f, padding_value); \ } - const int num_jagged_dim = y.dim() - 2; + const auto num_jagged_dim = y.dim() - 2; JAGGED_TENSOR_DISPATCH_DIMS(); #undef INVOKE_KERNEL_WITH_DIM @@ -269,7 +272,7 @@ void jagged_dense_elementwise_jagged_output_kernel_( " != NUM_JAGGED_DIM, ", NUM_JAGGED_DIM); - const int outer_dense_size = y.size(0); + const auto outer_dense_size = y.size(0); TORCH_CHECK( outer_dense_size == x_offsets[0].numel() - 1, "outer_dense_size, ", @@ -278,7 +281,7 @@ void jagged_dense_elementwise_jagged_output_kernel_( x_offsets[0].numel() - 1); TORCH_CHECK( !NO_INNER_DENSE || y.size(-1) == 1, "y.size(-1), ", y.size(-1), " != 1"); - const int inner_dense_size = NO_INNER_DENSE ? 1 : y.size(-1); + const auto inner_dense_size = NO_INNER_DENSE ? 
1 : y.size(-1); TORCH_CHECK( inner_dense_size == x_values.size(-1), "inner_dense_size, ", @@ -290,9 +293,9 @@ void jagged_dense_elementwise_jagged_output_kernel_( return; } - const int jagged_folded_size = - y.numel() / (outer_dense_size * inner_dense_size); - const int jagged_innermost_size = y.size(-2); + const auto jagged_folded_size = + y.numel() / (static_cast(outer_dense_size * inner_dense_size)); + const auto jagged_innermost_size = y.size(-2); // Canonicalize y to 3D, collapsing jagged dimensions. Tensor y_reshaped = y.view({y.size(0), -1, y.size(-1)}); @@ -317,11 +320,11 @@ void jagged_dense_elementwise_jagged_output_kernel_( // As a perf optimization, a separate loop level for the inner-most // jagged dimension. if (!is_zero) { - const int begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; - const int end = + const auto begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; + const auto end = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base + 1]; for (int jiidx = 0; - jiidx < std::min(end - begin, jagged_innermost_size); + jiidx < std::min(end - begin, static_cast(jagged_innermost_size)); ++jiidx) { int jidx = joidx * jagged_innermost_size + jiidx; if (NO_INNER_DENSE) { @@ -527,7 +530,7 @@ void jagged_jagged_elementwise_dense_output_kernel_( " != NUM_JAGGED_DIM, ", NUM_JAGGED_DIM); - const int outer_dense_size = output.size(0); + const auto outer_dense_size = output.size(0); TORCH_CHECK( outer_dense_size == x_offsets[0].numel() - 1, "outer_dense_size, ", @@ -535,7 +538,7 @@ void jagged_jagged_elementwise_dense_output_kernel_( " != x_offsets[0].numel() - 1, ", x_offsets[0].numel() - 1); TORCH_CHECK(!NO_INNER_DENSE || output.size(-1) == 1); - const int inner_dense_size = NO_INNER_DENSE ? 1 : output.size(-1); + const auto inner_dense_size = NO_INNER_DENSE ? 1 : output.size(-1); TORCH_CHECK( inner_dense_size == x_values.size(-1), "inner_dense_size, ", @@ -547,9 +550,9 @@ void jagged_jagged_elementwise_dense_output_kernel_( return; } - const int jagged_folded_size = - output.numel() / (outer_dense_size * inner_dense_size); - const int jagged_innermost_size = output.size(-2); + const auto jagged_folded_size = + output.numel() / (static_cast(outer_dense_size * inner_dense_size)); + const auto jagged_innermost_size = output.size(-2); // Canonicalize output to 3D, collapsing jagged dimensions. Tensor output_reshaped = output.view({output.size(0), -1, output.size(-1)}); @@ -576,10 +579,10 @@ void jagged_jagged_elementwise_dense_output_kernel_( // jagged dimension. 
int jiidx = 0; if (!is_zero) { - const int begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; - const int end = + const auto begin = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base]; + const auto end = x_offsets_accessors[NUM_JAGGED_DIM - 1][offset_base + 1]; - for (; jiidx < std::min(end - begin, jagged_innermost_size); ++jiidx) { + for (; jiidx < std::min(end - begin, static_cast(jagged_innermost_size)); ++jiidx) { int jidx = joidx * jagged_innermost_size + jiidx; if (NO_INNER_DENSE) { output_accessor[oidx][jidx][0] = @@ -628,7 +631,7 @@ void jagged_jagged_elementwise_dense_output_( index_t>(x_values, x_offsets, y_values, output, f, padding_value); \ } - const int num_jagged_dim = output.dim() - 2; + const auto num_jagged_dim = output.dim() - 2; JAGGED_TENSOR_DISPATCH_DIMS(); #undef INVOKE_KERNEL_WITH_DIM @@ -662,14 +665,14 @@ std::tuple jagged_dense_elementwise_mul_backward( x_offsets, y, x_values_grad, - [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); + std::multiplies()); jagged_jagged_elementwise_dense_output_( grad_output, x_offsets, x_values, y_grad, - [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); + std::multiplies()); }); return {x_values_grad, y_grad}; @@ -695,14 +698,14 @@ void dense_vec_jagged_2d_bmm( const pta::TensorAccessor& a_values, const pta::TensorAccessor& a_offsets, pta::TensorAccessor output) { - const int B = a_offsets.size(0) - 1; - const int H = v.size(0) / B; - const int max_L = v.size(1); - const int D = output.size(1); + const auto B = a_offsets.size(0) - 1; + const auto H = v.size(0) / B; + const auto max_L = v.size(1); + const auto D = output.size(1); for (const auto b : c10::irange(B)) { - const int row_start = a_offsets[b]; - const int row_end = a_offsets[b + 1]; - const int length = std::min(row_end - row_start, max_L); + const auto row_start = a_offsets[b]; + const auto row_end = a_offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); if (length == 0) { for (const auto h : c10::irange(H)) { for (const auto d : c10::irange(D)) { @@ -732,14 +735,14 @@ void dense_vec_jagged_2d_transposed_bmm( const pta::TensorAccessor& a_values, const pta::TensorAccessor& a_offsets, pta::TensorAccessor output) { - const int B = a_offsets.size(0) - 1; - const int H = v.size(0) / B; - const int max_L = output.size(1); - const int D = v.size(1); + const auto B = a_offsets.size(0) - 1; + const auto H = v.size(0) / B; + const auto max_L = output.size(1); + const auto D = v.size(1); for (const auto b : c10::irange(B)) { - const int row_start = a_offsets[b]; - const int row_end = a_offsets[b + 1]; - const int length = std::min(row_end - row_start, max_L); + const auto row_start = a_offsets[b]; + const auto row_end = a_offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); if (D == 0) { for (const auto h : c10::irange(H)) { @@ -772,16 +775,16 @@ void outer_prod_jagged_2d_output( const pta::TensorAccessor& y, const pta::TensorAccessor& offsets, pta::TensorAccessor output_values) { - const int B = offsets.size(0) - 1; - const int H = x.size(0) / B; - const int max_L = x.size(1); - const int D = y.size(1); + const auto B = offsets.size(0) - 1; + const auto H = x.size(0) / B; + const auto max_L = x.size(1); + const auto D = y.size(1); for (const auto b : c10::irange(B)) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = row_end - row_start; + const auto row_start = offsets[b]; + const auto row_end = offsets[b + 1]; + const auto length = 
row_end - row_start; for (const auto h : c10::irange(H)) { - for (int l = 0; l < std::min(length, max_L); ++l) { + for (int l = 0; l < std::min(length, static_cast(max_L)); ++l) { for (const auto d : c10::irange(D)) { output_values[row_start + l][h * D + d] = x[b * H + h][l] * y[b * H + h][d]; @@ -799,16 +802,16 @@ Tensor batched_dense_vec_jagged_2d_mul_forward( TENSOR_ON_CPU(a_values); TENSOR_ON_CPU(a_offsets); - const int B = a_offsets.numel() - 1; + const auto B = a_offsets.numel() - 1; TORCH_CHECK( B == 0 || v.size(0) % B == 0, "B, ", B, " doesn't divide v.size(0), ", v.size(0)); - const int H = B == 0 ? 1 : v.size(0) / B; - const int D = a_values.size(-1) / H; - auto output = at::empty({B * H, D}, v.options()); + const auto H = B == 0 ? 1 : v.size(0) / B; + const auto D = a_values.size(-1) / H; + auto output = at::empty({static_cast(B * H), D}, v.options()); if (B > 0 && D > 0) { const auto func_name = "batched_dense_vec_jagged_2d_mul_forward"; @@ -838,8 +841,8 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward( Tensor a_values_grad = at::zeros_like(a_values); Tensor v_grad = at::empty_like(v); - const int B = a_offsets.numel() - 1; - const int D = grad_output.size(-1); + const auto B = a_offsets.numel() - 1; + const auto D = grad_output.size(-1); if (B > 0 && D > 0) { const auto func_name = "batched_dense_vec_jagged_2d_mul_backward"; @@ -1046,7 +1049,7 @@ jagged_2d_to_dense_forward_cpu(Tensor values, Tensor offsets, int64_t max_L) { /*padding_value=*/0); } -std::vector stacked_jagged_1d_to_dense_cpu( +static std::vector stacked_jagged_1d_to_dense_cpu( Tensor values, Tensor lengths, const std::vector& offset_per_key, @@ -1066,7 +1069,7 @@ std::vector stacked_jagged_1d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B)]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; @@ -1105,7 +1108,7 @@ std::vector stacked_jagged_2d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B)]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; @@ -1301,7 +1304,7 @@ void jagged_index_add_2d_kernel( auto& lock = locks[output_offset]; while (lock.test_and_set(std::memory_order_acquire)) { // For C++20 -#if defined(__cpp_lib_atomic_flag_test) +#ifdef __cpp_lib_atomic_flag_test while (lock.test(std::memory_order_relaxed)) #endif ; @@ -1361,12 +1364,12 @@ void jagged_softmax_kernel( const at::TensorAccessor& offsets, at::TensorAccessor output, const int64_t max_L) { - const int B = offsets.size(0) - 1; - const int D = values.size(1); + const auto B = offsets.size(0) - 1; + const auto D = values.size(1); for (const auto b : c10::irange(B)) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = std::min(row_end - row_start, static_cast(max_L)); + const auto row_start = offsets[b]; + const auto row_end = offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); if (length == 0) continue; @@ -1396,8 +1399,8 @@ Tensor jagged_softmax_forward( const int64_t max_L) { TENSOR_ON_CPU(values); TENSOR_ON_CPU(offsets); - const 
int B = offsets.numel() - 1; - const int D = values.size(1); + const auto B = offsets.numel() - 1; + const auto D = values.size(1); auto output = at::empty_like(values); if (B > 0 && D > 0) { @@ -1423,12 +1426,12 @@ void jagged_softmax_backward_kernel( const at::TensorAccessor& offsets, at::TensorAccessor grad_input, const int64_t max_L) { - const int B = offsets.size(0) - 1; - const int D = grad_output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = grad_output.size(1); for (const auto b : c10::irange(B)) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = std::min(row_end - row_start, static_cast(max_L)); + const auto row_start = offsets[b]; + const auto row_end = offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); if (length == 0) continue; for (const auto d : c10::irange(D)) { @@ -1454,8 +1457,8 @@ Tensor jagged_softmax_backward( TENSOR_ON_CPU(grad_output); TENSOR_ON_CPU(output); TENSOR_ON_CPU(offsets); - const int B = offsets.numel() - 1; - const int D = grad_output.size(1); + const auto B = offsets.numel() - 1; + const auto D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); if (B > 0 && D > 0) { @@ -1484,13 +1487,13 @@ void jagged_jagged_bmm_kernel( const at::TensorAccessor& offsets, at::TensorAccessor output, const int64_t max_L) { - const int B = offsets.size(0) - 1; - const int M = x_values.size(1); - const int N = y_values.size(1); + const auto B = offsets.size(0) - 1; + const auto M = x_values.size(1); + const auto N = y_values.size(1); for (const auto b : c10::irange(B)) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = std::min(row_end - row_start, static_cast(max_L)); + const auto row_start = offsets[b]; + const auto row_end = offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); for (const auto m : c10::irange(M)) { for (const auto n : c10::irange(N)) { at::acc_type acc = 0; @@ -1511,9 +1514,9 @@ Tensor jagged_jagged_bmm_forward( TENSOR_ON_CPU(x_values); TENSOR_ON_CPU(y_values); TENSOR_ON_CPU(offsets); - const int B = offsets.size(0) - 1; - const int M = x_values.size(-1); - const int N = y_values.size(-1); + const auto B = offsets.size(0) - 1; + const auto M = x_values.size(-1); + const auto N = y_values.size(-1); auto output = at::zeros({B, M, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { AT_DISPATCH_INDEX_TYPES( @@ -1541,13 +1544,13 @@ void jagged_dense_bmm_kernel( at::TensorAccessor output, const int64_t max_L) { // [sum_B, K] x [B, K, N] -> [B, L, N] -> [sum_B, N] - const int B = x_offsets.size(0) - 1; - const int K = x_values.size(1); - const int N = y.size(2); + const auto B = x_offsets.size(0) - 1; + const auto K = x_values.size(1); + const auto N = y.size(2); for (const auto b : c10::irange(B)) { - const int row_start = x_offsets[b]; - const int row_end = x_offsets[b + 1]; - const int length = std::min(row_end - row_start, static_cast(max_L)); + const auto row_start = x_offsets[b]; + const auto row_end = x_offsets[b + 1]; + const auto length = std::min(row_end - row_start, static_cast(max_L)); for (const auto l : c10::irange(length)) { for (const auto n : c10::irange(N)) { at::acc_type acc = 0; @@ -1568,10 +1571,10 @@ Tensor jagged_dense_bmm_forward( TENSOR_ON_CPU(x_values); TENSOR_ON_CPU(x_offsets); TENSOR_ON_CPU(y); - const int B = x_offsets.size(0) - 1; - const int M = x_values.size(-1); - const int N = y.size(-1); - const int total_L = x_values.size(0); + 
const auto B = x_offsets.size(0) - 1; + const auto M = x_values.size(-1); + const auto N = y.size(-1); + const auto total_L = x_values.size(0); auto output = at::zeros({total_L, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { AT_DISPATCH_INDEX_TYPES( diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp index 6afb07cfac..e8c48d0ab7 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp @@ -20,7 +20,7 @@ namespace fbgemm_gpu { using Tensor = at::Tensor; ///@ingroup jagged-tensor-ops-meta -Tensor jagged_to_padded_dense_forward_meta( +static Tensor jagged_to_padded_dense_forward_meta( const Tensor& values, const std::vector& offsets, c10::SymIntArrayRef max_lengths, @@ -42,7 +42,7 @@ Tensor jagged_to_padded_dense_forward_meta( return at::empty_symint(padded_values_shape, values.options()); } -Tensor jagged_to_padded_dense_meta( +static Tensor jagged_to_padded_dense_meta( const Tensor& values, const std::vector& offsets, const c10::SymIntArrayRef max_lengths, @@ -51,7 +51,7 @@ Tensor jagged_to_padded_dense_meta( values, offsets, max_lengths, padding_value); } -Tensor jagged_to_padded_dense_backward_meta( +static Tensor jagged_to_padded_dense_backward_meta( const at::Tensor& grad_output, const std::vector& offsets, at::SymInt total_L) { @@ -70,7 +70,7 @@ Tensor jagged_to_padded_dense_backward_meta( return D_folded ? grad_values.squeeze(-1) : grad_values; } -Tensor jagged_dense_dense_elementwise_add_jagged_output_forward_meta( +static Tensor jagged_dense_dense_elementwise_add_jagged_output_forward_meta( const at::Tensor& x_values, const std::vector& /*x_offsets*/, const at::Tensor& y_0, @@ -79,7 +79,7 @@ Tensor jagged_dense_dense_elementwise_add_jagged_output_forward_meta( return at::empty_like(x_values); } -std::tuple> +static std::tuple> jagged_dense_dense_elementwise_add_jagged_output_meta( const at::Tensor& x_values, const std::vector& x_offsets, @@ -89,28 +89,28 @@ jagged_dense_dense_elementwise_add_jagged_output_meta( return {at::empty_like(x_values), x_offsets}; } -Tensor jagged_dense_elementwise_add_meta( +static Tensor jagged_dense_elementwise_add_meta( const Tensor& /* unused x_values */, const std::vector& /*x_offsets*/, const Tensor& y) { return at::empty_like(y); } -std::tuple> jagged_dense_elementwise_mul_meta( +static std::tuple> jagged_dense_elementwise_mul_meta( const Tensor& x_values, const std::vector& x_offsets, const Tensor& /* unused y */) { return {at::empty_like(x_values), x_offsets}; } -Tensor jagged_dense_elementwise_mul_forward_meta( +static Tensor jagged_dense_elementwise_mul_forward_meta( const Tensor& x_values, const std::vector& /*x_offsets*/, const Tensor& /*y*/) { return at::empty_like(x_values); } -std::tuple jagged_dense_elementwise_mul_backward_meta( +static std::tuple jagged_dense_elementwise_mul_backward_meta( const Tensor& grad_output, const std::vector& /*x_offsets*/, const Tensor& y, @@ -121,7 +121,7 @@ std::tuple jagged_dense_elementwise_mul_backward_meta( return {x_values_grad, y_grad}; } -std::tuple> +static std::tuple> jagged_dense_elementwise_add_jagged_output_meta( const at::Tensor& x_values, const std::vector& x_offsets, @@ -129,7 +129,7 @@ jagged_dense_elementwise_add_jagged_output_meta( return {at::empty_like(x_values), x_offsets}; } -Tensor batched_dense_vec_jagged_2d_mul_forward_meta( +static Tensor batched_dense_vec_jagged_2d_mul_forward_meta( const Tensor& v, const Tensor& 
a_values, const Tensor& a_offsets) { @@ -145,14 +145,14 @@ Tensor batched_dense_vec_jagged_2d_mul_forward_meta( return at::empty_symint({B * H, D}, v.options()); } -Tensor batched_dense_vec_jagged_2d_mul_meta( +static Tensor batched_dense_vec_jagged_2d_mul_meta( const Tensor& v, const Tensor& a_values, const Tensor& a_offsets) { return batched_dense_vec_jagged_2d_mul_forward_meta(v, a_values, a_offsets); } -std::tuple batched_dense_vec_jagged_2d_mul_backward_meta( +static std::tuple batched_dense_vec_jagged_2d_mul_backward_meta( const Tensor& /*grad_output*/, const Tensor& v, const Tensor& a_values, @@ -162,7 +162,7 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward_meta( return {v_grad, a_values_grad}; } -Tensor jagged_dense_bmm_forward_meta( +static Tensor jagged_dense_bmm_forward_meta( const Tensor& x_values, const Tensor& /*x_offsets*/, const Tensor& y, @@ -172,14 +172,14 @@ Tensor jagged_dense_bmm_forward_meta( return at::zeros_symint({total_L, N}, x_values.options()); } -Tensor jagged_softmax_forward_meta( +static Tensor jagged_softmax_forward_meta( const Tensor& values, const Tensor& /*offsets*/, const int64_t /*max_L*/) { return at::empty_like(values); } -Tensor jagged_jagged_bmm_forward_meta( +static Tensor jagged_jagged_bmm_forward_meta( const Tensor& x_values, const Tensor& y_values, const Tensor& offsets, @@ -191,7 +191,7 @@ Tensor jagged_jagged_bmm_forward_meta( return output; } -Tensor jagged_softmax_backward_meta( +static Tensor jagged_softmax_backward_meta( const Tensor& grad_output, const Tensor& /*output*/, const Tensor& /*offsets*/, @@ -219,7 +219,7 @@ Tensor jagged_2d_to_dense_meta( /*padding_value=*/0); } -Tensor get_source_mask_meta( +static Tensor get_source_mask_meta( const Tensor& num_sources, const Tensor& num_targets, const int64_t output_size) { diff --git a/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp index a01de3a673..6b036a1984 100644 --- a/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp +++ b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp @@ -21,7 +21,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { ///@ingroup layout-transform-cpu -Tensor recat_embedding_grad_output_mixed_D_cpu( +static Tensor recat_embedding_grad_output_mixed_D_cpu( const Tensor& grad_output, // [B_local][Sum_T_global(D)] const std::vector& dim_sum_per_rank) { TORCH_CHECK(grad_output.is_contiguous()); diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp index 908de23eff..3541b596d7 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp @@ -18,7 +18,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { -Tensor merge_pooled_embeddings_cpu( +static Tensor merge_pooled_embeddings_cpu( std::vector pooled_embeddings, int64_t /*uncat_dim_size*/, at::Device target_device, @@ -49,7 +49,7 @@ Tensor merge_pooled_embeddings_cpu( return result; } -Tensor sum_reduce_to_one_cpu( +static Tensor sum_reduce_to_one_cpu( std::vector input_tensors, at::Device /* target_device */) { TORCH_CHECK(!input_tensors.empty()); @@ -64,7 +64,7 @@ Tensor sum_reduce_to_one_cpu( return result; } -std::vector all_to_one_device_cpu( +static std::vector all_to_one_device_cpu( std::vector input_tensors, at::Device /* target_device */) { for (const auto& t : 
input_tensors) { diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp index 172aa04ae8..60b7d2b6ee 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp @@ -88,7 +88,7 @@ at::Tensor permute_duplicate_pooled_embs_cpu( } ///@ingroup permute-pooled-embs-cpu -at::Tensor permute_pooled_embs_auto_grad( +static at::Tensor permute_pooled_embs_auto_grad( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, @@ -104,7 +104,7 @@ at::Tensor permute_pooled_embs_auto_grad( } ///@ingroup permute-pooled-embs-cpu -at::Tensor permute_pooled_embs_auto_grad_cpu( +static at::Tensor permute_pooled_embs_auto_grad_cpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, @@ -120,7 +120,7 @@ at::Tensor permute_pooled_embs_auto_grad_cpu( } ///@ingroup permute-duplicate-pooled-embs-cpu -at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu( +static at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, @@ -135,7 +135,7 @@ at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu( true); } -at::Tensor permute_pooled_embs_meta( +static at::Tensor permute_pooled_embs_meta( const Tensor& pooled_embs, const Tensor& /* offset_dim_list */, const Tensor& /* permute_list */, @@ -144,7 +144,7 @@ at::Tensor permute_pooled_embs_meta( return torch::empty_like(pooled_embs); } -at::Tensor permute_pooled_embs_auto_grad_meta( +static at::Tensor permute_pooled_embs_auto_grad_meta( const Tensor& pooled_embs, const Tensor& /* offset_dim_list */, const Tensor& /* permute_list */, diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp index 61e9627482..8aeeb9d88b 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp @@ -19,7 +19,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { ///@ingroup permute-pooled-embs-gpu -Tensor permute_pooled_embs_auto_grad_gpu( +static Tensor permute_pooled_embs_auto_grad_gpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, @@ -34,7 +34,7 @@ Tensor permute_pooled_embs_auto_grad_gpu( } ///@ingroup permute-duplicate-pooled-embs-gpu -Tensor permute_duplicate_pooled_embs_auto_grad_gpu( +static Tensor permute_duplicate_pooled_embs_auto_grad_gpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, diff --git a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp index d60360c5a1..1eee2f06d8 100644 --- a/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp +++ b/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp @@ -25,7 +25,7 @@ using torch::autograd::AutogradContext; using torch::autograd::Variable; using torch::autograd::variable_list; -Tensor permute_pooled_embs_split_cpu_impl( +static Tensor permute_pooled_embs_split_cpu_impl( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& 
offset_dim_list, const Tensor& permute_list, @@ -80,7 +80,7 @@ Tensor permute_pooled_embs_split_cpu( false); } -Tensor permute_duplicate_pooled_embs_split_cpu( +static Tensor permute_duplicate_pooled_embs_split_cpu( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& offset_dim_list, const Tensor& permute_list, @@ -95,7 +95,7 @@ Tensor permute_duplicate_pooled_embs_split_cpu( true); } -Tensor permute_pooled_embs_split_dispatch_call( +static Tensor permute_pooled_embs_split_dispatch_call( const Tensor& pooled_embs, // [B_local][Sum_T_global(D)] const Tensor& offset_dim_list, const Tensor& permute_list, @@ -113,7 +113,7 @@ Tensor permute_pooled_embs_split_dispatch_call( inv_permute_list); } -Tensor permute_duplicate_pooled_embs_split_dispatch_call( +static Tensor permute_duplicate_pooled_embs_split_dispatch_call( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, @@ -148,7 +148,7 @@ Tensor permute_pooled_embs_auto_grad_split_cpu( inv_permute_list); } -Tensor permute_duplicate_pooled_embs_auto_grad_split_cpu( +static Tensor permute_duplicate_pooled_embs_auto_grad_split_cpu( const Tensor& pooled_embs, const Tensor& offset_dim_list, const Tensor& permute_list, diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp index 65e93e474e..4a1169a03b 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp @@ -26,7 +26,7 @@ using Tensor = at::Tensor; namespace fbgemm_gpu { template -Tensor& _float_to_fused8bitrowwise_cpu_out_t( +static Tensor& _float_to_fused8bitrowwise_cpu_out_t( Tensor& output, const Tensor& input) { TENSOR_ON_CPU(input); @@ -56,7 +56,7 @@ Tensor& _float_to_fused8bitrowwise_cpu_out_t( } template -Tensor& _fused8bitrowwise_to_float_cpu_out_t( +static Tensor& _fused8bitrowwise_to_float_cpu_out_t( Tensor& output, const Tensor& input, const bool scale_bias_last, @@ -100,7 +100,7 @@ Tensor& _fused8bitrowwise_to_float_cpu_out_t( } template -Tensor _float_to_fusednbitrowwise_cpu( +static Tensor _float_to_fusednbitrowwise_cpu( const Tensor& input, const int64_t bit_rate, const input_t* rowwise_min_max = nullptr) { @@ -136,7 +136,7 @@ Tensor _float_to_fusednbitrowwise_cpu( } template -Tensor _fusednbitrowwise_to_float_cpu( +static Tensor _fusednbitrowwise_to_float_cpu( const Tensor& input, const int64_t bit_rate) { TENSOR_ON_CPU(input); @@ -173,7 +173,7 @@ Tensor _fusednbitrowwise_to_float_cpu( // Both float16 and bfloat16 are of same type uint16_t template -Tensor _fusednbitrowwise_sbfront_to_float_or_half_cpu( +static Tensor _fusednbitrowwise_sbfront_to_float_or_half_cpu( const Tensor& input, const int64_t bit_rate) { TENSOR_ON_CPU(input); @@ -264,7 +264,7 @@ Tensor& _float_to_fused8bitrowwise_cpu_out( return _float_to_fused8bitrowwise_cpu_out_t(output, input); } -Tensor& _half_to_fused8bitrowwise_cpu_out(Tensor& output, const Tensor& input) { +static Tensor& _half_to_fused8bitrowwise_cpu_out(Tensor& output, const Tensor& input) { return _float_to_fused8bitrowwise_cpu_out_t(output, input); } @@ -395,7 +395,7 @@ Tensor fusednbitrowwise_to_float_cpu( /// float32. The input tensor should have torch.quint4x2 or torch.quint2x4 dtype /// and QuantizedCPU backend. This operator is only recommended for testing /// purpose because its kernel is reference implementation and not optimized. 
-Tensor fusednbitrowwise_sbfront_to_float_cpu( +static Tensor fusednbitrowwise_sbfront_to_float_cpu( const Tensor& input, const int64_t bit_rate, const int64_t output_dtype) { @@ -512,7 +512,7 @@ static Tensor float_or_half_to_fusednbitrowwise_cpu_with_rowwise_min_max( /// @ingroup quantize-data-cpu /// -void FloatToFP8Quantized_ref( +static void FloatToFP8Quantized_ref( const float* const input, const size_t nrows, const size_t ncols, @@ -533,7 +533,7 @@ void FloatToFP8Quantized_ref( /// @ingroup quantize-data-cpu /// -void FP8QuantizedToFloat_ref( +static void FP8QuantizedToFloat_ref( const uint8_t* const input, const size_t nrows, const size_t ncols, diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp index a1b4ea2f86..5bc19f0a16 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp @@ -21,7 +21,7 @@ namespace fbgemm_gpu { /// @ingroup quantize-data-meta /// -Tensor FP8rowwise_to_float_meta( +static Tensor FP8rowwise_to_float_meta( const Tensor& input, [[maybe_unused]] bool forward, const int64_t output_dtype) { @@ -52,7 +52,7 @@ Tensor FP8rowwise_to_float_meta( /// @ingroup quantize-data-meta /// -Tensor FloatToFP8RowwiseQuantized_meta(const Tensor& input, bool /*forward*/) { +static Tensor FloatToFP8RowwiseQuantized_meta(const Tensor& input, bool /*forward*/) { TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); const at::SymIntArrayRef input_sizes = input.sym_sizes(); @@ -69,7 +69,7 @@ Tensor FloatToFP8RowwiseQuantized_meta(const Tensor& input, bool /*forward*/) { /// @ingroup quantize-data-meta /// -Tensor fusednbitrowwise_to_float_or_half_meta( +static Tensor fusednbitrowwise_to_float_or_half_meta( const Tensor& input, const int64_t bit_rate, const int64_t output_dtype, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cpp b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cpp index 1c0d115aa4..25e2079612 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cpp @@ -25,7 +25,7 @@ at::Tensor asynchronous_batched_complete_cumsum_cpu(const at::Tensor& t_in) { return output; } -at::Tensor asynchronous_batched_complete_cumsum_meta(const at::Tensor& values) { +static at::Tensor asynchronous_batched_complete_cumsum_meta(const at::Tensor& values) { auto B = values.sym_size(0); auto len = values.sym_size(1); auto output = at::native::empty_meta_symint( diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp index 3cf7fd96ca..605f8a0a6b 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp @@ -22,7 +22,7 @@ namespace fbgemm_gpu { // 1D exclusive scan: output[i] = input[i-1] + input[i-2] + input[i-3] // Used as a helper to several functions below. 
template -U exclusive_scan_ptrs_cpu( +static U exclusive_scan_ptrs_cpu( const int64_t N, const T* const input, U* const output) { diff --git a/fbgemm_gpu/src/split_embeddings_cache/raw_embedding_streamer.cpp b/fbgemm_gpu/src/split_embeddings_cache/raw_embedding_streamer.cpp index 09107ccfc4..c0b80ac5b4 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/raw_embedding_streamer.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/raw_embedding_streamer.cpp @@ -199,13 +199,13 @@ RawEmbeddingStreamer::~RawEmbeddingStreamer() { } void RawEmbeddingStreamer::stream( - [[maybe_unused]] const at::Tensor& indices, - [[maybe_unused]] const at::Tensor& weights, - [[maybe_unused]] std::optional identities, - [[maybe_unused]] std::optional runtime_meta, - [[maybe_unused]] const at::Tensor& count, - [[maybe_unused]] bool require_tensor_copy, - [[maybe_unused]] bool blocking_tensor_copy) { + const at::Tensor& /*indices*/, + const at::Tensor& /*weights*/, + std::optional /*identities*/, + std::optional /*runtime_meta*/, + const at::Tensor& /*count*/, + bool /*require_tensor_copy*/, + bool /*blocking_tensor_copy*/) { if (!enable_raw_embedding_streaming_) { return; } diff --git a/fbgemm_gpu/src/tbe/eeg/eeg_models.h b/fbgemm_gpu/src/tbe/eeg/eeg_models.h index 939713f101..b471b1e3c0 100644 --- a/fbgemm_gpu/src/tbe/eeg/eeg_models.h +++ b/fbgemm_gpu/src/tbe/eeg/eeg_models.h @@ -59,10 +59,10 @@ struct IndicesDistributionParameters { // Max index value in the distribution - should be in the range [0, E), where // E is the number of rows in the embedding table - int64_t maxIndex; + int64_t maxIndex{}; // Number of indices to generate - int64_t numIndices; + int64_t numIndices{}; IndicesDistributionParameters( const std::vector& _1, @@ -81,7 +81,7 @@ DECL_OSTREAM_OUT(IndicesDistributionParameters); struct TBEBatchStats { // batch size, i.e., number of lookups - int64_t B; + int64_t B{}; // Standard deviation of B (for variable batch size configuration) std::optional sigmaB; @@ -106,7 +106,7 @@ DECL_OSTREAM_OUT(TBEIndicesStats); struct TBEPoolingStats { // Bag size, i.e., pooling factor - int64_t L; + int64_t L{}; // Standard deviation of L(for variable bag size configuration) std::optional sigmaL; @@ -117,11 +117,11 @@ DECL_OSTREAM_OUT(TBEPoolingStats); struct TBEAnalysisStats { // Number of tables - int64_t T; + int64_t T{}; // Number of rows in the embedding table - int64_t E; + int64_t E{}; // Embedding dimension (number of columns) - int64_t D; + int64_t D{}; // Batch stats TBEBatchStats batch; // Indices stats diff --git a/fbgemm_gpu/src/topology_utils.cpp b/fbgemm_gpu/src/topology_utils.cpp index 7ad513819d..240caaf8cf 100644 --- a/fbgemm_gpu/src/topology_utils.cpp +++ b/fbgemm_gpu/src/topology_utils.cpp @@ -152,7 +152,7 @@ AdjacencyMatrix get_nvlink_matrix() { NVML_CHECK( nvmlDeviceGetHandleByIndex(cuda_device_to_nvml_device[i], &handle)); for (const auto link : c10::irange(NVML_NVLINK_MAX_LINKS)) { - nvmlEnableState_t is_active; + nvmlEnableState_t is_active{NVML_FEATURE_DISABLED}; auto nvmlRet = nvmlDeviceGetNvLinkState(handle, link, &is_active); if (nvmlRet == NVML_ERROR_INVALID_ARGUMENT || nvmlRet == NVML_ERROR_NOT_SUPPORTED) {