Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,14 @@ struct StreamQueueItem {
std::optional<at::Tensor> identities;
std::optional<at::Tensor> runtime_meta;
at::Tensor count;
StreamQueueItem(
at::Tensor src_indices,
at::Tensor src_weights,
std::optional<at::Tensor> src_identities,
std::optional<at::Tensor> src_runtime_meta,
at::Tensor src_count) {
indices = std::move(src_indices);
weights = std::move(src_weights);
identities = std::move(src_identities);
runtime_meta = std::move(src_runtime_meta);
count = std::move(src_count);
}
// Takes ownership of every argument by moving it into the matching member.
//
// NOTE: the member initializer list below is written in member DECLARATION
// order (indices, weights, identities, runtime_meta, count). C++ always
// initializes members in declaration order regardless of how the initializer
// list is written, so keeping the two in sync avoids the misleading mismatch
// (and the -Wreorder warning) the previous ordering produced.
StreamQueueItem(
    at::Tensor src_indices,
    at::Tensor src_weights,
    std::optional<at::Tensor> src_identities,
    std::optional<at::Tensor> src_runtime_meta,
    at::Tensor src_count)
    : indices(std::move(src_indices)),
      weights(std::move(src_weights)),
      identities(std::move(src_identities)),
      runtime_meta(std::move(src_runtime_meta)),
      count(std::move(src_count)) {}
};

class RawEmbeddingStreamer : public torch::jit::CustomClassHolder {
Expand Down
2 changes: 1 addition & 1 deletion fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ class PackedTensorAccessor
}

protected:
size_t numel_;
size_t numel_{};
char name_[NAME_MAX_LEN];
char context_[CONTEXT_MAX_LEN];

Expand Down
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/config/feature_gates.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ std::string to_string(const FeatureGateName& value) {
return "UNKNOWN";
}

bool ev_check_key(const std::string& key) {
static bool ev_check_key(const std::string& key) {
const auto env_var = "FBGEMM_" + key;

const auto value = std::getenv(env_var.c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using Tensor = at::Tensor;
namespace fbgemm_gpu {

template <typename index_t>
void embedding_inplace_update_cpu_kernel(
static void embedding_inplace_update_cpu_kernel(
at::TensorAccessor<uint8_t, 1> dev_weights,
at::TensorAccessor<uint8_t, 1> uvm_weights,
const at::TensorAccessor<int32_t, 1>& weights_placements,
Expand Down Expand Up @@ -151,9 +151,9 @@ void dram_kv_embedding_inplace_update_cpu(

const uint8_t* weights_tys_ptr = weights_tys.data_ptr<uint8_t>();
const int32_t* D_offsets_ptr = D_offsets.data_ptr<int32_t>();
const uint8_t* update_weights_ptr = update_weights.data_ptr<uint8_t>();
uint8_t* update_weights_ptr = update_weights.mutable_data_ptr<uint8_t>();
const int32_t* update_table_idx_ptr = update_table_idx.data_ptr<int32_t>();
const int64_t* update_row_idx_ptr = update_row_idx.data_ptr<int64_t>();
int64_t* update_row_idx_ptr = update_row_idx.mutable_data_ptr<int64_t>();
const int64_t* update_offsets_ptr = update_offsets.data_ptr<int64_t>();

int64_t window_start = 0;
Expand All @@ -172,15 +172,13 @@ void dram_kv_embedding_inplace_update_cpu(
int32_t D_bytes =
nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment);

uint8_t* batched_weights_ptr = const_cast<uint8_t*>(
update_weights_ptr + update_offsets_ptr[window_start]);
uint8_t* batched_weights_ptr = update_weights_ptr + update_offsets_ptr[window_start];
auto weights_tensor = at::from_blob(
batched_weights_ptr,
{window_size, D_bytes},
at::TensorOptions().dtype(at::kByte));

int64_t* row_ids_ptr =
const_cast<int64_t*>(update_row_idx_ptr + window_start);
int64_t* row_ids_ptr = update_row_idx_ptr + window_start;
auto row_id_tensor = at::from_blob(
row_ids_ptr, {window_size}, at::TensorOptions().dtype(at::kLong));

Expand Down
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/faster_hash_ops/faster_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ std::tuple<Tensor, Tensor> create_zch_buffer_cpu(
return {identity, metadata};
}

void zero_collision_hash_cpu_out(
static void zero_collision_hash_cpu_out(
Tensor& output,
const Tensor& input,
const Tensor& identities,
Expand Down
14 changes: 7 additions & 7 deletions fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ using Tensor = at::Tensor;

namespace fbgemm_gpu {

void _cat_int_tensors_out(
static void _cat_int_tensors_out(
Tensor& combined_tensors,
const std::vector<Tensor>& tensor_list,
int64_t total_num,
Expand Down Expand Up @@ -82,7 +82,7 @@ void _cat_int_tensors_out(
}
}

Tensor _cat_int_tensors(
static Tensor _cat_int_tensors(
const std::vector<Tensor>& tensor_list,
int64_t total_num,
bool use_pin_memory,
Expand All @@ -107,7 +107,7 @@ Tensor _cat_int_tensors(
return combined_tensors;
}

Tensor _cat_int_tensors_with_padding(
static Tensor _cat_int_tensors_with_padding(
const std::vector<Tensor>& tensor_list,
int64_t total_num,
bool use_pin_memory,
Expand Down Expand Up @@ -140,7 +140,7 @@ Tensor _cat_int_tensors_with_padding(
return combined_tensors;
}

void _cat_per_sample_weights_list_out(
static void _cat_per_sample_weights_list_out(
Tensor& out,
const std::vector<Tensor>& per_sample_weights,
const std::vector<Tensor>& indices_list,
Expand Down Expand Up @@ -178,7 +178,7 @@ void _cat_per_sample_weights_list_out(
}
}

Tensor _cat_per_sample_weights_list(
static Tensor _cat_per_sample_weights_list(
const std::vector<Tensor>& per_sample_weights,
const std::vector<Tensor>& indices_list,
int64_t total_num,
Expand Down Expand Up @@ -375,7 +375,7 @@ void tbe_input_combine_with_length_cpu_out(
combined_per_sample_weights.resize_({0});
}

std::tuple<Tensor, Tensor, Tensor> tbe_input_combine_with_length_cpu(
static std::tuple<Tensor, Tensor, Tensor> tbe_input_combine_with_length_cpu(
const std::vector<Tensor>& indices_list,
const std::vector<Tensor>& lengths_list,
const std::vector<Tensor>& per_sample_weights) {
Expand Down Expand Up @@ -518,7 +518,7 @@ std::tuple<Tensor, Tensor, Tensor> padding_fused_tbe_input_combine_cpu(
/// @param lengths_list list of lengths.
/// @param per_sample_weights list of per_sample_weights
/// @return tuple of combined indices, lengths, and per_sample_weights
std::tuple<Tensor, Tensor, Tensor>
static std::tuple<Tensor, Tensor, Tensor>
padding_fused_tbe_input_combine_with_length_cpu(
const std::vector<Tensor>& indices_list,
const std::vector<Tensor>& lengths_list,
Expand Down
6 changes: 3 additions & 3 deletions fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ enum args_pos {
};

template <typename T>
uint64_t compute_num_uint64s(const uint64_t num_elements) {
static uint64_t compute_num_uint64s(const uint64_t num_elements) {
const uint64_t ratio = sizeof(uint64_t) / sizeof(T);
return (num_elements + ratio - 1) / ratio;
}

void offset_tbe_input_combine_with_length_args(
static void offset_tbe_input_combine_with_length_args(
uint64_t** indices_addrs,
uint64_t** lengths_addrs,
uint64_t** indices_offsets,
Expand All @@ -59,7 +59,7 @@ void offset_tbe_input_combine_with_length_args(
reinterpret_cast<uint32_t*>(base_addr + ptr_offsets[P_lengths_is_long]);
}

std::tuple<Tensor, Tensor, Tensor> tbe_input_combine_with_length_gpu(
static std::tuple<Tensor, Tensor, Tensor> tbe_input_combine_with_length_gpu(
const std::vector<Tensor>& indices_list,
const std::vector<Tensor>& lengths_list,
const std::vector<Tensor>& per_sample_weights) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,7 @@ class JaggedSliceOp : public torch::autograd::Function<JaggedSliceOp> {
} // namespace

///@ingroup jagged-tensor-ops-cpu
Tensor jagged_to_padded_dense_forward_autograd(
static Tensor jagged_to_padded_dense_forward_autograd(
const Tensor& values,
const std::vector<Tensor>& offsets,
const c10::SymIntArrayRef max_lengths,
Expand Down Expand Up @@ -883,7 +883,7 @@ std::tuple<Tensor, std::vector<Tensor>> dense_to_jagged(
auto output = op.call(dense, offsets, total_L);
return {output, offsets};
}
Tensor dense_to_jagged_forward_autograd(
static Tensor dense_to_jagged_forward_autograd(
const Tensor& dense,
const std::vector<Tensor>& offsets,
std::optional<at::SymInt> total_L) {
Expand Down
Loading