From f7ac0e408813fd67df1b1cddebea3d205d17dc0b Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 20 Dec 2025 15:29:06 +0800 Subject: [PATCH 1/2] More clang-tidy fixes Signed-off-by: Yuanyuan Chen --- .../jagged_tensor_ops_cpu.cpp | 19 +++++++++++-------- .../merge_pooled_embedding_ops_gpu.cpp | 2 +- .../permute_multi_embedding_function.cpp | 5 +++-- .../lfu_cache_populate_byte.cpp | 3 +-- .../lru_cache_populate_byte.cpp | 6 ++---- .../split_embeddings_cache_ops.cpp | 2 +- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 5af92e517a..95372326ad 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include "ATen/Parallel.h" #include "common.h" @@ -144,7 +147,7 @@ void jagged_dense_elementwise_dense_output_kernel_( } const int jagged_folded_size = - y.numel() / (outer_dense_size * inner_dense_size); + y.numel() / (static_cast(outer_dense_size * inner_dense_size)); const int jagged_innermost_size = y.size(-2); // Canonicalize y and output to 3D, collapsing jagged dimensions. @@ -291,7 +294,7 @@ void jagged_dense_elementwise_jagged_output_kernel_( } const int jagged_folded_size = - y.numel() / (outer_dense_size * inner_dense_size); + y.numel() / (static_cast(outer_dense_size * inner_dense_size)); const int jagged_innermost_size = y.size(-2); // Canonicalize y to 3D, collapsing jagged dimensions. @@ -547,7 +550,7 @@ void jagged_jagged_elementwise_dense_output_kernel_( } const int jagged_folded_size = - output.numel() / (outer_dense_size * inner_dense_size); + output.numel() / (static_cast(outer_dense_size * inner_dense_size)); const int jagged_innermost_size = output.size(-2); // Canonicalize output to 3D, collapsing jagged dimensions. @@ -661,14 +664,14 @@ std::tuple jagged_dense_elementwise_mul_backward( x_offsets, y, x_values_grad, - [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); + std::multiplies()); jagged_jagged_elementwise_dense_output_( grad_output, x_offsets, x_values, y_grad, - [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); + std::multiplies()); }); return {x_values_grad, y_grad}; @@ -807,7 +810,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward( v.size(0)); const int H = B == 0 ? 
1 : v.size(0) / B; const int D = a_values.size(-1) / H; - auto output = at::empty({B * H, D}, v.options()); + auto output = at::empty({static_cast(B * H), D}, v.options()); if (B > 0 && D > 0) { const auto func_name = "batched_dense_vec_jagged_2d_mul_forward"; @@ -1065,7 +1068,7 @@ std::vector stacked_jagged_1d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; @@ -1104,7 +1107,7 @@ std::vector stacked_jagged_2d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp index e483a2893b..a08ae22f8d 100644 --- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp @@ -195,7 +195,7 @@ void all_to_one( get_intermediate_node(fbgemm_gpu::get_nvlink_matrix()); for (const auto i : c10::irange(input_tensors.size())) { const auto& src = input_tensors.at(i); - Node src_device_id = src.get_device(); + auto src_device_id = src.get_device(); auto intermediate_node = intermediate_nodes(src_device_id, target_device_index); if (intermediate_node != -1) { diff --git a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp index c269b60270..26f601461a 100644 --- a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp +++ b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp @@ -7,6 +7,7 @@ */ #include "fbgemm_gpu/permute_multi_embedding_function.h" +#include #include #include @@ -170,7 +171,7 @@ kt_regroup_arguments_impl( } // flattened permutes vector with size of out_num * PermuteParam::size - std::vector permutes(out_num * PermuteParam::size); + std::vector permutes(static_cast(out_num * PermuteParam::size)); int32_t* __restrict__ pp = permutes.data(); // the lengths of each output tensor int32_t* __restrict__ out_offset = out_lengths.data(); @@ -191,7 +192,7 @@ kt_regroup_arguments_impl( for (const auto& key : groups[out_tensor]) { // query the loockup dictionary for input tensor index, offset, and length auto [in_tensor, length, in_offset] = lookup.at(key); - int32_t* __restrict__ curr_pp = pp + curr * PermuteParam::size; + int32_t* __restrict__ curr_pp = pp + static_cast(curr * PermuteParam::size); curr_pp[PermuteParam::in_tensor] = in_tensor; curr_pp[PermuteParam::out_tensor] = out_tensor; diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp index ede3f0304d..6272f46e38 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp @@ -26,7 +26,6 @@ DLL_PUBLIC void 
lfu_cache_populate_byte_cpu( Tensor /*lxu_cache_weights*/, Tensor /*lfu_state*/, int64_t /*row_alignment*/) { - return; -} + } } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp index defefa5051..9ceb87569b 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -28,8 +28,7 @@ DLL_PUBLIC void lru_cache_populate_byte_cpu( int64_t /*row_alignment*/, bool /*gather_cache_stats*/, std::optional /*uvm_cache_stats*/) { - return; -} + } DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cpu( Tensor /*weights*/, @@ -48,7 +47,6 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cpu( int64_t /*row_alignment*/, bool /*gather_cache_stats*/, std::optional /*uvm_cache_stats*/) { - return; -} + } } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp index 6bbd1dd7a6..7f57b4fd23 100644 --- a/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp +++ b/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp @@ -77,7 +77,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_META("lxu_cache_lookup", lxu_cache_lookup_meta); } -static auto raw_embedding_streamer = +auto raw_embedding_streamer = torch::class_( "fbgemm", "RawEmbeddingStreamer") From 1cc27afa076f4a11706f39912941774e6597cfc5 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Sat, 20 Dec 2025 15:51:13 +0800 Subject: [PATCH 2/2] Replace .data_ptr Signed-off-by: Yuanyuan Chen --- ...bedding_forward_quantized_cpu_template.cpp | 36 ++++---- .../embedding_forward_quantized_host_cpu.cpp | 10 +- ...ing_backward_split_cpu_approx_template.cpp | 6 +- .../embedding_backward_split_cpu_template.cpp | 10 +- .../forward/embedding_forward_split_cpu.cpp | 8 +- .../embedding_forward_split_nobag_cpu.cpp | 8 +- .../embedding_forward_split_template.cu | 2 +- .../batch_index_select_dim0_cpu_host.cpp | 2 +- .../batch_index_select_dim0_host.cpp | 4 +- .../example/src/cutlass_sgemm_nn.cu | 6 +- .../blackwell_gen_impl.cu | 6 +- .../src/kv_cache/kv_cache_dequantize.cu | 2 +- .../cutlass_extensions/f8f8bf16_cublas.cu | 4 +- .../fast_gemv/bf16fp8bf16_fast_gemv.cu | 2 +- .../fast_gemv/fp8fp8bf16_fast_gemv.cu | 4 +- .../gen_ai/src/quantize/quantize.cu | 44 ++++----- .../include/fbgemm_gpu/utils/tensor_utils.h | 4 +- .../dram_kv_embedding_cache.h | 16 ++-- .../dram_kv_inference_embedding.h | 4 +- .../embedding_inplace_update_cpu.cpp | 12 +-- .../src/jagged_tensor_ops/get_source_mask.cu | 2 +- .../jagged_tensor_ops_cpu.cpp | 21 ++--- .../layout_transform_ops.cu | 4 +- fbgemm_gpu/src/metric_ops/metric_ops.cu | 2 +- .../permute_multi_embedding_ops.cu | 4 +- .../permute_multi_embedding_ops_cpu.cpp | 2 +- .../quantize_fused_8bit_rowwise.cu | 8 +- .../quantize_fused_nbit_rowwise.cu | 4 +- .../src/quantize_ops/quantize_ops_cpu.cpp | 14 +-- .../quantize_padded_fp8_rowwise.cu | 6 +- .../src/sparse_ops/sparse_async_cumsum.cpp | 2 +- .../sparse_batched_unary_embeddings.cu | 4 +- .../sparse_compute_frequency_sequence.cu | 2 +- fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 92 +++++++++---------- fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp | 6 +- .../sparse_pack_segments_backward.cu | 4 +- .../sparse_pack_segments_forward.cu | 12 +-- fbgemm_gpu/src/sparse_ops/sparse_range.cu | 2 +- 
.../src/sparse_ops/sparse_segment_sum_csr.cu | 2 +- .../split_embeddings_cache/cachelib_cache.cpp | 2 +- fbgemm_gpu/src/tbe/eeg/indices_estimator.cpp | 2 +- 41 files changed, 192 insertions(+), 195 deletions(-) diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp index 1a2942071c..567494b457 100644 --- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp @@ -78,9 +78,9 @@ void pruned_hashmap_insert_{{ wdesc }}_cpu( using uidx_t = std::conditional_t, uint64_t, uint32_t>; - const auto* indices_acc = indices.data_ptr(); - const auto* dense_indices_acc = dense_indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); + const auto* dense_indices_acc = dense_indices.const_data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); auto hash_table_acc = hash_table.accessor(); const auto hash_table_offsets_acc = hash_table_offsets.accessor(); @@ -228,14 +228,14 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ return output; } - const int32_t* weights_placements_ptr = weights_placements.data_ptr(); + const int32_t* weights_placements_ptr = weights_placements.const_data_ptr(); const uint8_t* weights_acc; - const auto* weights_tys_acc = weights_tys.data_ptr(); + const auto* weights_tys_acc = weights_tys.const_data_ptr(); DISPATCH_OUTPUT_TYPES(output.scalar_type(), "intn_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", [&] { {% if weighted %} - const float* indice_weights_acc = indice_weights.data_ptr(); + const float* indice_weights_acc = indice_weights.const_data_ptr(); {% endif %} using float16 = uint16_t; @@ -250,15 +250,15 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ float16, std::conditional::value, bfloat16, float>::type> ::type; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_", [&] { - const auto* indices_acc = indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); - const auto* weights_offsets_acc = weights_offsets.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); + const auto* weights_offsets_acc = weights_offsets.const_data_ptr(); - auto* output_acc = output.data_ptr(); + auto* output_acc = output.mutable_data_ptr(); for (const auto t : c10::irange(T)) { {% if not nobag %} - const auto* D_offsets_acc = D_offsets.data_ptr(); + const auto* D_offsets_acc = D_offsets.const_data_ptr(); const int32_t D_start = D_offsets_acc[t]; const int32_t D_end = D_offsets_acc[t + 1]; const int32_t D = D_end - D_start; @@ -295,7 +295,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ {% if nobag %} // Create virtual offsets for the nobag case. Lengths are all ones. 
const auto offsets_nobag = at::arange(*offsets_begin_ptr, offsets_acc[(t + 1) * B] + 1, offsets.options()); - const index_t* offsets_nobag_ptr = offsets_nobag.data_ptr(); + const index_t* offsets_nobag_ptr = offsets_nobag.const_data_ptr(); TORCH_CHECK(offsets_nobag.numel() == index_size + 1); TORCH_CHECK(offsets_nobag_ptr[index_size] - offsets_nobag_ptr[0] == index_size); {% endif %} @@ -449,9 +449,9 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu( using utdx_t = std::conditional_t, uint64_t, uint32_t>; - const auto* indices_acc = indices.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); auto* dense_indices_acc = dense_indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); const auto hash_table_acc = hash_table.accessor(); const auto hash_table_offsets_acc = hash_table_offsets.accessor(); @@ -527,12 +527,12 @@ Tensor pruned_array_lookup_cpu( using remap_t = index_t; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "pruned_array_lookup_cpu_1", [&] { - const auto* indices_acc = indices.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); auto* dense_indices_acc = dense_indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); - const auto index_remappings_acc = index_remappings.data_ptr(); - const auto index_remappings_offsets_acc = index_remappings_offsets.data_ptr(); + const auto index_remappings_acc = index_remappings.const_data_ptr(); + const auto index_remappings_offsets_acc = index_remappings_offsets.const_data_ptr(); at::parallel_for(0, T, 1, [&](int64_t begin, int64_t end) { for (const auto t : c10::irange(begin, end)) { diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp index 98f0235ac0..d0e6ec5e76 100644 --- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp @@ -412,9 +412,9 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder { void insert(Tensor indices, Tensor dense_indices, Tensor offsets, int64_t T) { int32_t B = (offsets.size(0) - 1) / T; TORCH_CHECK(B > 0); - const auto* indices_acc = indices.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); auto* dense_indices_acc = dense_indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); maps_.resize(T); for (const auto t : c10::irange(T)) { auto& map = maps_[t]; @@ -447,9 +447,9 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder { auto dense_indices = empty_like(indices); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "PrunedMapCPU::lookup", [&] { - const auto* indices_acc = indices.data_ptr(); + const auto* indices_acc = indices.const_data_ptr(); auto* dense_indices_acc = dense_indices.data_ptr(); - const auto* offsets_acc = offsets.data_ptr(); + const auto* offsets_acc = offsets.const_data_ptr(); for (const auto t : c10::irange(T)) { auto& map = maps_[t]; @@ -563,7 +563,7 @@ struct TensorQueue : torch::CustomClassHolder { const std::string key = "queue"; Tensor size_tensor; size_tensor = dict.at(std::string(key + "/size")).cpu(); - const auto* size_tensor_acc = size_tensor.data_ptr(); + const auto* size_tensor_acc = size_tensor.const_data_ptr(); int64_t queue_size = size_tensor_acc[0]; for (const auto index : c10::irange(queue_size)) { diff --git 
a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_approx_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_approx_template.cpp index 2069d13048..8c1f5a68ca 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_approx_template.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_approx_template.cpp @@ -145,11 +145,11 @@ split_embedding_backward_codegen_{{ optimizer }}_cpu( AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "split_embedding_backward_approx_cpu_kernel_1", [&] { auto grad_stride = grad_output.size(1); - const float* grad_output_data = grad_output.data_ptr(); + const float* grad_output_data = grad_output.const_data_ptr(); float* host_weights_data = host_weights.data_ptr(); - const auto* indices_data = indices.data_ptr(); - const auto* offsets_data = offsets.data_ptr(); + const auto* indices_data = indices.const_data_ptr(); + const auto* offsets_data = offsets.const_data_ptr(); const auto hash_size_cumsum_data = hash_size_cumsum.accessor(); float* momentum1_data = momentum1_host.data_ptr(); diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp index 8a9e4e1d7f..1c4844ee79 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp @@ -67,7 +67,7 @@ void split_embedding_backward_exact_cpu_kernel( const at::TensorAccessor momentum2_offsets_data, {% endif %} {{ args.split_cpu_kernel_args | join(", ") }}) { - const grad_t* grad_output_data = grad_output.data_ptr(); + const grad_t* grad_output_data = grad_output.const_data_ptr(); auto host_weights_data = host_weights.accessor(); const auto hash_size_cumsum_data = hash_size_cumsum.accessor(); @@ -252,12 +252,12 @@ void split_embedding_nobag_backward_exact_cpu_kernel( const at::TensorAccessor momentum2_offsets_data, {% endif %} {{ args.split_cpu_kernel_args | join(", ") }}) { - const grad_t* grad_output_data = grad_output.data_ptr(); + const grad_t* grad_output_data = grad_output.const_data_ptr(); auto host_weights_data = host_weights.accessor(); const auto hash_size_cumsum_data = hash_size_cumsum.accessor(); - const auto indices_data = indices.data_ptr(); - const auto offsets_data = offsets.data_ptr(); - const auto weights_offsets_data = weights_offsets.data_ptr(); + const auto indices_data = indices.const_data_ptr(); + const auto offsets_data = offsets.const_data_ptr(); + const auto weights_offsets_data = weights_offsets.const_data_ptr(); typedef std::unordered_map>> tb_grad_buffer_map_t; typedef std::unordered_map tb_fb_map_t; diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_cpu.cpp b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_cpu.cpp index ac17ab5959..6659a0d684 100644 --- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_cpu.cpp +++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_cpu.cpp @@ -67,18 +67,18 @@ void split_embedding_forward_cpu_kernel( const auto D_offsets_data = D_offsets.accessor(); const auto weights_offsets_data = weights_offsets.accessor(); - const auto indices_data = indices.data_ptr(); - const auto offsets_data = offsets.data_ptr(); + const auto indices_data = indices.const_data_ptr(); + const auto offsets_data = offsets.const_data_ptr(); const auto hash_size_cumsum_data = hash_size_cumsum.accessor(); - const auto 
weights_data = weights.data_ptr(); + const auto weights_data = weights.const_data_ptr(); // If indice_weights not defined, then this accessor won't be used. // The else condition is just to make compiler happy const auto indice_weights_data = indice_weights.defined() ? indice_weights.data_ptr() : nullptr; - auto output_data = output.data_ptr(); + auto output_data = output.mutable_data_ptr(); auto output_stride = output.size(1); constexpr bool use_fbgemm = (std::is_same::value || diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_nobag_cpu.cpp b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_nobag_cpu.cpp index cb5fb17d65..fb8d6db75c 100644 --- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_nobag_cpu.cpp +++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_nobag_cpu.cpp @@ -43,10 +43,10 @@ void split_embedding_nobag_codegen_forward_cpu_kernel( const auto weights_offsets_data = weights_offsets.accessor(); const auto hash_size_cumsum_data = hash_size_cumsum.accessor(); - const auto indices_data = indices.data_ptr(); - const auto offsets_data = offsets.data_ptr(); - const auto weights_data = weights.data_ptr(); - auto output_data = output.data_ptr(); + const auto indices_data = indices.const_data_ptr(); + const auto offsets_data = offsets.const_data_ptr(); + const auto weights_data = weights.const_data_ptr(); + auto output_data = output.mutable_data_ptr(); int64_t T = weights_offsets.size(0); int64_t B = (offsets.size(0) - 1) / T; diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu index a3edb6b965..b7b2641491 100755 --- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu @@ -846,7 +846,7 @@ batch_index_select_dim0_codegen_forward_cuda( reinterpret_cast(D_offsets.data_ptr()), weights_offsets.data_ptr(), lxu_cache_locations.data_ptr(), - output.data_ptr() + output.mutable_data_ptr() ); } {%- endif %} // if has_experimental diff --git a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp index 91b9d24177..b55ef868c7 100644 --- a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp +++ b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp @@ -37,7 +37,7 @@ Tensor tensor_from_vec(const std::vector& vec) { std::vector vecref_from_tensor(const Tensor& t) { TORCH_CHECK(t.is_contiguous()); const auto numel = static_cast(t.numel()); - const auto* ptr = t.data_ptr(); + const auto* ptr = t.const_data_ptr(); return std::vector(ptr, ptr + numel); }; diff --git a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp index 18378b6106..fb0b171143 100644 --- a/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp +++ b/fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp @@ -291,7 +291,7 @@ class BatchIndexSelectDim0GPUOp auto _grad_output = grad_output; // FIXME: to support aligned memory access in Vec4T load/store function // 16 for FP32 and 8 for FP16 - if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 || + if (reinterpret_cast(grad_output.mutable_data_ptr()) % 16 != 0 || at::has_internal_overlap(grad_output) != at::MemOverlap::No) { _grad_output = 
at::empty_like(grad_output).copy_(grad_output); } @@ -612,7 +612,7 @@ class BatchIndexSelectDim0TensorGPUOp auto _grad_output = grad_output; // FIXME: to support aligned memory access in Vec4T load/store function // 16 for FP32 and 8 for FP16 - if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 || + if (reinterpret_cast(grad_output.mutable_data_ptr()) % 16 != 0 || at::has_internal_overlap(grad_output) != at::MemOverlap::No) { _grad_output = at::empty_like(grad_output).copy_(grad_output); } diff --git a/fbgemm_gpu/experimental/example/src/cutlass_sgemm_nn.cu b/fbgemm_gpu/experimental/example/src/cutlass_sgemm_nn.cu index 22a52c9bbe..8a9097fc11 100644 --- a/fbgemm_gpu/experimental/example/src/cutlass_sgemm_nn.cu +++ b/fbgemm_gpu/experimental/example/src/cutlass_sgemm_nn.cu @@ -35,9 +35,9 @@ at::Tensor sgemm_float_cuda( const auto ldb = N; const auto ldc = N; - const auto* A = TA.data_ptr(); - const auto* B = TB.data_ptr(); - const auto* C = TC.data_ptr(); + const auto* A = TA.const_data_ptr(); + const auto* B = TB.const_data_ptr(); + const auto* C = TC.const_data_ptr(); const auto alpha = static_cast(alpha_); const auto beta = static_cast(beta_); diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu index 638b6a495d..850a911741 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu @@ -270,11 +270,11 @@ struct GenRunner { typename Operation::Arguments arguments{ problem_shape, - static_cast(seqlen_kv.data_ptr()), - static_cast(batch_idx ? batch_idx.value().data_ptr() : nullptr), + static_cast(seqlen_kv.const_data_ptr()), + static_cast(batch_idx ? 
batch_idx.value().const_data_ptr() : nullptr), static_cast(split_k_size), static_cast(window_size), - static_cast(q.data_ptr()), + static_cast(q.const_data_ptr()), stride_q, static_cast(nullptr), // ptr_new_k stride_new_k, diff --git a/fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_dequantize.cu b/fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_dequantize.cu index d26e2a370b..a029ca04a3 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_dequantize.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_dequantize.cu @@ -757,7 +757,7 @@ at::Tensor quantize_qkv_per_head( dim3 block_size(kThreadsPerWarp, kWarpsPerBlock); dim3 grid_size(kMaxBlocks); auto scale_q = at::zeros({B, N_KVH_L}, XQ_O.options().dtype(at::kFloat)); - float* const scale_q_ptr = scale_q.data_ptr(); + float* const scale_q_ptr = scale_q.const_data_ptr(); // Launch the kernel FBGEMM_LAUNCH_KERNEL( diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_cublas.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_cublas.cu index 81788ffb9d..417ebb6e56 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_cublas.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_cublas.cu @@ -105,7 +105,7 @@ at::Tensor f8f8bf16_cublas( sizeof(fastAccuMode))); if (Ainvs.has_value()) { - const float* Ainvs_pt = Ainvs.value().data_ptr(); + const float* Ainvs_pt = Ainvs.value().const_data_ptr(); checkCublasStatus(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, @@ -114,7 +114,7 @@ at::Tensor f8f8bf16_cublas( } if (Binvs.has_value()) { - const float* Binvs_pt = Binvs.value().data_ptr(); + const float* Binvs_pt = Binvs.value().const_data_ptr(); checkCublasStatus(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/bf16fp8bf16_fast_gemv.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/bf16fp8bf16_fast_gemv.cu index f959d77f36..cf08784c62 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/bf16fp8bf16_fast_gemv.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/bf16fp8bf16_fast_gemv.cu @@ -91,7 +91,7 @@ bf16fp8bf16_fast_gemv(at::Tensor X, at::Tensor W, at::Tensor w_scale) { k, m, n, - reinterpret_cast(w_scale.data_ptr()), + reinterpret_cast(w_scale.const_data_ptr()), num_per_thread); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/fp8fp8bf16_fast_gemv.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/fp8fp8bf16_fast_gemv.cu index bb10260844..b89fb60520 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/fp8fp8bf16_fast_gemv.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/fast_gemv/fp8fp8bf16_fast_gemv.cu @@ -245,8 +245,8 @@ at::Tensor fp8fp8bf16_fast_gemv( k, m, n, - reinterpret_cast(w_scale.data_ptr()), - reinterpret_cast(x_scale.data_ptr())); + reinterpret_cast(w_scale.const_data_ptr()), + reinterpret_cast(x_scale.const_data_ptr())); if (!dispatched) { throw std::runtime_error("f8f8bf16_fast_gemv cannot run."); diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu index 2b7b23f9a4..bef3630f70 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu @@ -820,7 +820,7 @@ at::Tensor get_fp8_per_tensor_scale( const auto 
stream = at::cuda::getCurrentCUDAStream(); invokeComputeScale( reinterpret_cast(scale.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), input.size(0), @@ -867,7 +867,7 @@ at::Tensor quantize_fp8_per_tensor_fixed_scale( invokeQuantizeMatrix( reinterpret_cast<__nv_fp8_e4m3*>(quantized_input.data_ptr()), reinterpret_cast(scale.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stochastic_rounding, @@ -927,7 +927,7 @@ std::vector quantize_fp8_per_tensor( } invokeComputeScale( reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), total_elements_per_slice, @@ -940,7 +940,7 @@ std::vector quantize_fp8_per_tensor( invokeQuantizeMatrix( quantized_input_ptr, reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stochastic_rounding, @@ -949,7 +949,7 @@ std::vector quantize_fp8_per_tensor( invokeQuantizeMatrix( quantized_input_ptr, reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stochastic_rounding, @@ -958,7 +958,7 @@ std::vector quantize_fp8_per_tensor( } else { invokeComputeScale( reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), -1, @@ -971,7 +971,7 @@ std::vector quantize_fp8_per_tensor( invokeQuantizeMatrix( quantized_input_ptr, reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stochastic_rounding, @@ -980,7 +980,7 @@ std::vector quantize_fp8_per_tensor( invokeQuantizeMatrix( quantized_input_ptr, reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stochastic_rounding, @@ -1329,7 +1329,7 @@ std::vector quantize_fp8_per_row( // optional upper‑bound pointer const float* scale_ub_ptr = nullptr; if (scale_ub.has_value()) { - scale_ub_ptr = reinterpret_cast(scale_ub.value().data_ptr()); + scale_ub_ptr = reinterpret_cast(scale_ub.value().const_data_ptr()); } // launch parameters const int threads = 128; // 128 threads / block @@ -1342,7 +1342,7 @@ std::vector quantize_fp8_per_row( <<>>( reinterpret_cast<__nv_fp8_e4m3*>(quantized.data_ptr()), reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), static_cast(K), scale_ub_ptr); } else { @@ -1350,7 +1350,7 @@ std::vector quantize_fp8_per_row( <<>>( reinterpret_cast<__nv_fp8_e5m2*>(quantized.data_ptr()), reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), static_cast(K), scale_ub_ptr); } @@ -1401,7 +1401,7 @@ std::vector quantize_fp8_per_col( invokeComputeScalesAndQuantizeMatrixCol( quantized_input_ptr, reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), stream); @@ -2108,7 +2108,7 @@ std::vector fake_quantize_nvfp4_per_tensor( if (!static_scales.has_value()) { invokeComputeFP4GlobalAmax( reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + 
reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), total_elements_per_slice, @@ -2120,8 +2120,8 @@ std::vector fake_quantize_nvfp4_per_tensor( } fp4_fused_amax_quantize( quantized_input_ptr, - reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(scales.const_data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), 16, stream); @@ -2129,7 +2129,7 @@ std::vector fake_quantize_nvfp4_per_tensor( if (!static_scales.has_value()) { invokeComputeFP4GlobalAmax( reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), input.size(-1), -1, @@ -2141,8 +2141,8 @@ std::vector fake_quantize_nvfp4_per_tensor( } fp4_fused_amax_quantize( quantized_input_ptr, - reinterpret_cast(scales.data_ptr()), - reinterpret_cast(input.data_ptr()), + reinterpret_cast(scales.const_data_ptr()), + reinterpret_cast(input.const_data_ptr()), input.numel(), 16, stream); @@ -2165,9 +2165,9 @@ void scaled_fp4_quant( int multiProcessorCount = get_device_attribute(cudaDevAttrMultiProcessorCount, -1); - auto input_sf_ptr = static_cast(input_sf.data_ptr()); + auto input_sf_ptr = static_cast(input_sf.const_data_ptr()); auto sf_out = static_cast(output_sf.data_ptr()); - auto output_ptr = static_cast(output.data_ptr()); + auto output_ptr = static_cast(output.mutable_data_ptr()); at::cuda::CUDAGuard device_guard{(char)input.get_device()}; auto stream = at::cuda::getStreamFromPool(false, input.get_device()); if (stream == nullptr) { @@ -2179,7 +2179,7 @@ void scaled_fp4_quant( switch (input.scalar_type()) { case torch::kHalf: { - auto input_ptr = reinterpret_cast(input.data_ptr()); + auto input_ptr = reinterpret_cast(input.const_data_ptr()); invokeFP4Quantization( m, n, @@ -2193,7 +2193,7 @@ void scaled_fp4_quant( break; } case torch::kBFloat16: { - auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr()); + auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.const_data_ptr()); invokeFP4Quantization( m, n, diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_utils.h b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_utils.h index 8cca21a11d..50d4e978f5 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_utils.h +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_utils.h @@ -316,13 +316,13 @@ inline at::Tensor aligned_grad_output_tensor_for_cuda_backwards( if (!aligned_grad_output.is_contiguous()) { aligned_grad_output = aligned_grad_output.contiguous(); } - if (reinterpret_cast(aligned_grad_output.data_ptr()) % 16 != 0) { + if (reinterpret_cast(aligned_grad_output.mutable_data_ptr()) % 16 != 0) { aligned_grad_output = at::empty_like(aligned_grad_output).copy_(aligned_grad_output); } TORCH_CHECK(aligned_grad_output.is_contiguous()); TORCH_CHECK( - reinterpret_cast(aligned_grad_output.data_ptr()) % 16 == 0); + reinterpret_cast(aligned_grad_output.mutable_data_ptr()) % 16 == 0); return aligned_grad_output; } diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 4d1d2895a6..736cc2bf64 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -277,7 +277,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { using index_t = scalar_t; CHECK(indices.is_contiguous()); auto indices_data_ptr = indices.data_ptr(); - auto* metadata = metadata_tensor.data_ptr(); + auto* metadata = 
metadata_tensor.mutable_data_ptr(); { auto before_read_lock_ts = facebook::WallClockUtil::NowInUsecFast(); @@ -397,7 +397,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { CHECK(weights.is_contiguous()); CHECK_EQ(indices.size(0), weights.size(0)); int64_t stride = weights.size(1); - auto indices_data_ptr = indices.data_ptr(); + auto indices_data_ptr = indices.const_data_ptr(); auto weights_data_ptr = weights.data_ptr(); { auto before_write_lock_ts = @@ -768,8 +768,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { CHECK(indices.is_contiguous()); CHECK(engege_rates.is_contiguous()); CHECK_EQ(indices.size(0), engege_rates.size(0)); - auto indices_data_ptr = indices.data_ptr(); - auto engage_rate_ptr = engege_rates.data_ptr(); + auto indices_data_ptr = indices.const_data_ptr(); + auto engage_rate_ptr = engege_rates.const_data_ptr(); { auto before_write_lock_ts = facebook::WallClockUtil::NowInUsecFast(); @@ -886,8 +886,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { int64_t width_offset = 0, std::optional width_length = std::nullopt) { auto read_count = count.scalar_type() == at::ScalarType::Long - ? *(count.data_ptr()) - : *(count.data_ptr()); + ? *(count.const_data_ptr()) + : *(count.const_data_ptr()); read_num_counts_ += read_count; // assuming get is called once each iteration and only by train // iteration(excluding state_dict) @@ -1407,8 +1407,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { // of // entries. auto conv_count = count.scalar_type() == at::ScalarType::Long - ? *(count.data_ptr()) - : *(count.data_ptr()); + ? *(count.const_data_ptr()) + : *(count.const_data_ptr()); auto indices_data_ptr = indices.data_ptr(); // There could be negative indices, which we should skipp for (int i = 0; i < conv_count; i++) { diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h index 57c1f28160..11f28f0d0e 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h @@ -784,8 +784,8 @@ class DramKVInferenceEmbedding // of // entries. auto conv_count = count.scalar_type() == at::ScalarType::Long - ? *(count.data_ptr()) - : *(count.data_ptr()); + ? 
*(count.const_data_ptr()) + : *(count.const_data_ptr()); auto indices_data_ptr = indices.data_ptr(); // There could be negative indices, which we should skipp for (int i = 0; i < conv_count; i++) { diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 5309a78f7b..9bbe7dc355 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -149,12 +149,12 @@ void dram_kv_embedding_inplace_update_cpu( auto embedding_log_inplace_update_stats_method = tbe_module->find_method("log_inplace_update_stats"); - const uint8_t* weights_tys_ptr = weights_tys.data_ptr(); - const int32_t* D_offsets_ptr = D_offsets.data_ptr(); - const uint8_t* update_weights_ptr = update_weights.data_ptr(); - const int32_t* update_table_idx_ptr = update_table_idx.data_ptr(); - const int64_t* update_row_idx_ptr = update_row_idx.data_ptr(); - const int64_t* update_offsets_ptr = update_offsets.data_ptr(); + const uint8_t* weights_tys_ptr = weights_tys.const_data_ptr(); + const int32_t* D_offsets_ptr = D_offsets.const_data_ptr(); + const uint8_t* update_weights_ptr = update_weights.const_data_ptr(); + const int32_t* update_table_idx_ptr = update_table_idx.const_data_ptr(); + const int64_t* update_row_idx_ptr = update_row_idx.const_data_ptr(); + const int64_t* update_offsets_ptr = update_offsets.const_data_ptr(); int64_t window_start = 0; while (window_start < N) { diff --git a/fbgemm_gpu/src/jagged_tensor_ops/get_source_mask.cu b/fbgemm_gpu/src/jagged_tensor_ops/get_source_mask.cu index 1d0c372508..a367c5ad94 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/get_source_mask.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/get_source_mask.cu @@ -95,7 +95,7 @@ Tensor get_source_mask_cuda( num_sources.data_ptr(), num_targets.data_ptr(), offsets.data_ptr(), - output.data_ptr(), + output.mutable_data_ptr(), static_cast(batch_size)); })); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index 95372326ad..ea28cfbb65 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -11,9 +11,6 @@ #include #include #include - -#include -#include #include "ATen/Parallel.h" #include "common.h" @@ -147,7 +144,7 @@ void jagged_dense_elementwise_dense_output_kernel_( } const int jagged_folded_size = - y.numel() / (static_cast(outer_dense_size * inner_dense_size)); + y.numel() / (outer_dense_size * inner_dense_size); const int jagged_innermost_size = y.size(-2); // Canonicalize y and output to 3D, collapsing jagged dimensions. @@ -294,7 +291,7 @@ void jagged_dense_elementwise_jagged_output_kernel_( } const int jagged_folded_size = - y.numel() / (static_cast(outer_dense_size * inner_dense_size)); + y.numel() / (outer_dense_size * inner_dense_size); const int jagged_innermost_size = y.size(-2); // Canonicalize y to 3D, collapsing jagged dimensions. @@ -550,7 +547,7 @@ void jagged_jagged_elementwise_dense_output_kernel_( } const int jagged_folded_size = - output.numel() / (static_cast(outer_dense_size * inner_dense_size)); + output.numel() / (outer_dense_size * inner_dense_size); const int jagged_innermost_size = output.size(-2); // Canonicalize output to 3D, collapsing jagged dimensions. 
@@ -664,14 +661,14 @@ std::tuple jagged_dense_elementwise_mul_backward( x_offsets, y, x_values_grad, - std::multiplies()); + [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); jagged_jagged_elementwise_dense_output_( grad_output, x_offsets, x_values, y_grad, - std::multiplies()); + [](scalar_t x, scalar_t y) -> scalar_t { return x * y; }); }); return {x_values_grad, y_grad}; @@ -810,7 +807,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward( v.size(0)); const int H = B == 0 ? 1 : v.size(0) / B; const int D = a_values.size(-1) / H; - auto output = at::empty({static_cast(B * H), D}, v.options()); + auto output = at::empty({B * H, D}, v.options()); if (B > 0 && D > 0) { const auto func_name = "batched_dense_vec_jagged_2d_mul_forward"; @@ -1068,7 +1065,7 @@ std::vector stacked_jagged_1d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; @@ -1107,7 +1104,7 @@ std::vector stacked_jagged_2d_to_dense_cpu( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "length_to_offset_cpu_kernel", [&] { index_t cumsum = 0; - const auto* input_ptr = &(lengths_contig.data_ptr()[static_cast(t * B]); + const auto* input_ptr = &(lengths_contig.data_ptr()[t * B]); auto* output_ptr = offsets.data_ptr() + 1; for (const auto i : c10::irange(B)) { cumsum += input_ptr[i]; @@ -1698,7 +1695,7 @@ Tensor get_source_mask_cpu( num_sources.scalar_type(), "get_source_mask_cpu", [&] { const index_t* num_sources_data = num_sources.data_ptr(); const index_t* num_targets_data = num_targets.data_ptr(); - bool* output_data = output.data_ptr(); + bool* output_data = output.mutable_data_ptr(); int64_t offset = 0; for (int64_t i = 0; i < batch_size; ++i) { diff --git a/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu index 20748a7f9b..e1b65068d4 100644 --- a/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu +++ b/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu @@ -154,8 +154,8 @@ Tensor recat_embedding_grad_output_mixed_D_batch_cuda( at::cuda::getCurrentCUDAStream(), dim_sum_per_rank.data_ptr(), cumsum_dim_sum_per_rank.data_ptr(), - grad_output.data_ptr(), - sharded_grad_output.data_ptr(), + grad_output.mutable_data_ptr(), + sharded_grad_output.mutable_data_ptr(), dim_num, B_local, dim_sum); diff --git a/fbgemm_gpu/src/metric_ops/metric_ops.cu b/fbgemm_gpu/src/metric_ops/metric_ops.cu index 4a3b0346c9..4c805196cb 100644 --- a/fbgemm_gpu/src/metric_ops/metric_ops.cu +++ b/fbgemm_gpu/src/metric_ops/metric_ops.cu @@ -265,7 +265,7 @@ at::Tensor batch_auc( dim3(NUM_THREADS_PER_BLOCK), \ 0, \ at::cuda::getCurrentCUDAStream(), \ - output.data_ptr(), \ + output.mutable_data_ptr(), \ indices.data_ptr(), \ labels.data_ptr(), \ weights.data_ptr(), \ diff --git a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu index 5c4a8bc3bd..30695c298b 100644 --- a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu +++ b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu @@ -135,7 +135,7 @@ Tensor from_vec(const std::vector& input) { // Ensure that output is contiguous 
TORCH_CHECK(output.is_contiguous()); std::memcpy( - output.data_ptr(), input.data(), input.size() * sizeof(index_t)); + output.mutable_data_ptr(), input.data(), input.size() * sizeof(index_t)); return output; } @@ -295,7 +295,7 @@ std::vector permute_multi_embedding_function_gpu( 0, at::cuda::getCurrentCUDAStream(), reinterpret_cast(in_ptr.data_ptr()), - reinterpret_cast(out_ptr.data_ptr()), + reinterpret_cast(out_ptr.mutable_data_ptr()), PTA_B(permutes, int32_t, 2, 32), PTA_B(in_shapes, int32_t, 1, 32), PTA_B(out_shapes, int32_t, 1, 32), diff --git a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp index 114bed4c78..7a587a0a47 100644 --- a/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp +++ b/fbgemm_gpu/src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp @@ -195,7 +195,7 @@ Tensor from_cpu(const std::vector& input) { // Ensure that output is contiguous TORCH_CHECK(output.is_contiguous()); std::memcpy( - output.data_ptr(), input.data(), input.size() * sizeof(index_t)); + output.mutable_data_ptr(), input.data(), input.size() * sizeof(index_t)); return output; } diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu index f2e85efb4e..b1ad10b8e1 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu @@ -279,7 +279,7 @@ Tensor _float_to_fused8bitrowwise_gpu_t(const Tensor& input) { input.data_ptr(), nrows, ncols, - output.data_ptr()); + output.mutable_data_ptr()); }); } else { // range_tensor is used to store the range for each embedding row. @@ -317,7 +317,7 @@ Tensor _float_to_fused8bitrowwise_gpu_t(const Tensor& input) { input.data_ptr(), nrows, ncols, - output.data_ptr(), + output.mutable_data_ptr(), range_tensor.data_ptr()); }); } @@ -341,7 +341,7 @@ Tensor _float_to_fused8bitrowwise_gpu_t(const Tensor& input) { range_tensor.data_ptr(), nrows, ncols, - output.data_ptr()); + output.mutable_data_ptr()); }); } } @@ -464,7 +464,7 @@ Tensor _fused8bitrowwise_to_float_gpu_t( input.data_ptr(), \ nrows, \ ncols, \ - output.data_ptr()) + output.mutable_data_ptr()) FBGEMM_DISPATCH_FLOATING_TYPES( output.scalar_type(), "fused8bitrowwise_to_float_cuda_kernel", [&] { diff --git a/fbgemm_gpu/src/quantize_ops/quantize_fused_nbit_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_fused_nbit_rowwise.cu index 456cadb923..8741fd79bd 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_fused_nbit_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_fused_nbit_rowwise.cu @@ -159,7 +159,7 @@ Tensor _float_to_fusednbitrowwise_gpu_t( input.data_ptr(), nrows, ncols, - output.data_ptr()); + output.mutable_data_ptr()); }); return output; @@ -278,7 +278,7 @@ Tensor _fusednbitrowwise_to_float_gpu_t( input.data_ptr(), \ nrows, \ ncols, \ - output.data_ptr()) + output.mutable_data_ptr()) FBGEMM_DISPATCH_FLOATING_TYPES( output.scalar_type(), "fusednbitrowwise_to_float_cuda_kernel", [&] { diff --git a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp index 65e93e474e..bf0c63da99 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp +++ b/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp @@ -50,7 +50,7 @@ Tensor& _float_to_fused8bitrowwise_cpu_out_t( input.data_ptr()); // input.data_ptr(); -> Yields // unresolved data_ptr symbol. 
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( - input_data, nrows, ncols, output.data_ptr()); + input_data, nrows, ncols, output.mutable_data_ptr()); return output; } @@ -84,7 +84,7 @@ Tensor& _fused8bitrowwise_to_float_cpu_out_t( at::native::resize_(output, output_dims, std::nullopt); auto output_data = static_cast( - output.data_ptr()); // output.data_ptr(); -> Yields + output.mutable_data_ptr()); // output.mutable_data_ptr(); -> Yields // unresolved data_ptr symbol. fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf< output_t, @@ -129,7 +129,7 @@ Tensor _float_to_fusednbitrowwise_cpu( input_data, nrows, ncols, - output.data_ptr(), + output.mutable_data_ptr(), rowwise_min_max); return output; @@ -162,7 +162,7 @@ Tensor _fusednbitrowwise_to_float_cpu( } auto output_data = static_cast( - output.data_ptr()); // output.data_ptr(); -> Yields + output.mutable_data_ptr()); // output.mutable_data_ptr(); -> Yields // unresolved data_ptr symbol. fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf( @@ -209,7 +209,7 @@ Tensor _fusednbitrowwise_sbfront_to_float_or_half_cpu( using output_ty = std:: conditional_t, float, fbgemm::float16>; output_ty* output_data = static_cast( - output.data_ptr()); // output.data_ptr(); -> Yields + output.mutable_data_ptr()); // output.mutable_data_ptr(); -> Yields // unresolved data_ptr symbol. constexpr bool is_uint16_t_of_type_bf16 = @@ -569,7 +569,7 @@ at::Tensor _float_to_hfp8_cpu( input.data_ptr(), nrows, ncols, - output.data_ptr(), + output.mutable_data_ptr(), ebits, exponent_bias, max_pos); @@ -596,7 +596,7 @@ at::Tensor _hfp8_to_float_cpu( input.data_ptr(), nrows, ncols, - output.data_ptr(), + output.mutable_data_ptr(), ebits, exponent_bias); diff --git a/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu b/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu index d3913d7f3d..6df66042aa 100644 --- a/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu +++ b/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu @@ -248,7 +248,7 @@ Tensor _float_to_paddedFP8rowwise_gpu_t( input.data_ptr(), nrows, ncols, - output.data_ptr(), + output.mutable_data_ptr(), forward, row_dim); }); @@ -360,7 +360,7 @@ Tensor _paddedFP8rowwise_to_float_gpu_t( threads_per_block, 0, at::cuda::getCurrentCUDAStream(), - output.data_ptr(), + output.mutable_data_ptr(), input.data_ptr(), output_columns, row_dim, @@ -383,7 +383,7 @@ Tensor _paddedFP8rowwise_to_float_gpu_t( nrows, ncols, output_columns, - output.data_ptr(), + output.mutable_data_ptr(), forward, row_dim, offsets.data_ptr()); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp index 3cf7fd96ca..15eee33bca 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp @@ -73,7 +73,7 @@ Tensor asynchronous_inclusive_cumsum_cpu(const Tensor& t_in) { scalar_t cumsum = 0; const auto* input_ptr = t_in_contig->data_ptr(); const auto N = t_in_contig->numel(); - auto* output_ptr = output.data_ptr(); + auto* output_ptr = output.mutable_data_ptr(); for (const auto i : c10::irange(N)) { cumsum += input_ptr[i]; diff --git a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu index 8fb9c65502..66c3229742 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu @@ -87,7 +87,7 @@ Tensor batched_unary_embeddings_forward_cuda( 
table_offsets.data_ptr(), offsets.data_ptr(), indices.data_ptr(), - output.data_ptr()); + output.mutable_data_ptr()); }); }); return output; @@ -237,7 +237,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda( N, B, T, - grad_output.data_ptr(), + grad_output.mutable_data_ptr(), table_offsets.data_ptr(), grad_weight.data_ptr(), PTA_B(sorted_linear_indices_run, index_t, 1, 32), diff --git a/fbgemm_gpu/src/sparse_ops/sparse_compute_frequency_sequence.cu b/fbgemm_gpu/src/sparse_ops/sparse_compute_frequency_sequence.cu index e8a420264f..6b51fee0ee 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_compute_frequency_sequence.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_compute_frequency_sequence.cu @@ -44,7 +44,7 @@ DLL_PUBLIC void compute_frequency_sequence( 0, at::cuda::getCurrentCUDAStream(), input.data_ptr(), - output.data_ptr(), + output.mutable_data_ptr(), start_input, input.numel()); }); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index ba58691e8d..02d78446f7 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -381,18 +381,18 @@ void _block_bucketize_sparse_features_cpu_kernel( const int32_t B = lengths_size / T; auto offsets = at::empty({lengths_size + 1}, lengths.options()); auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options()); - const offset_t* lengths_data = lengths.data_ptr(); + const offset_t* lengths_data = lengths.const_data_ptr(); offset_t* offsets_data = offsets.data_ptr(); - const index_t* indices_data = indices.data_ptr(); + const index_t* indices_data = indices.const_data_ptr(); scalar_t* weights_data = nullptr; scalar_t* new_weights_data = nullptr; index_t* new_pos_data = nullptr; index_t* unbucketize_permute_data = nullptr; index_t* bag_mapping_data = nullptr; - offset_t* const new_lengths_data = new_lengths.data_ptr(); - offset_t* const new_offsets_data = new_offsets.data_ptr(); - index_t* const new_indices_data = new_indices.data_ptr(); - const index_t* const block_sizes_data = block_sizes.data_ptr(); + offset_t* const new_lengths_data = new_lengths.mutable_data_ptr(); + offset_t* const new_offsets_data = new_offsets.mutable_data_ptr(); + index_t* const new_indices_data = new_indices.mutable_data_ptr(); + const index_t* const block_sizes_data = block_sizes.const_data_ptr(); offset_t* batch_sizes_data = nullptr; const auto variable_batch_size = batch_size_per_feature.has_value(); const auto variable_bucket_sizes = @@ -600,7 +600,7 @@ at::Tensor _float_to_bfloat16_cpu(const at::Tensor& input) { FloatToBFloat16Quantized_ref( input.data_ptr(), input.numel(), - reinterpret_cast(output.data_ptr())); + reinterpret_cast(output.mutable_data_ptr())); return output; } @@ -616,7 +616,7 @@ at::Tensor _bfloat16_to_float_cpu(const at::Tensor& input) { BFloat16QuantizedToFloat_ref( reinterpret_cast(input.data_ptr()), input.numel(), - output.data_ptr()); + output.mutable_data_ptr()); return output; } @@ -648,16 +648,16 @@ void _bucketize_sparse_features_cpu( const auto new_lengths_size = lengths_size * my_size; auto offsets = at::empty({lengths_size + 1}, lengths.options()); auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options()); - const index_t* lengths_data = lengths.data_ptr(); + const index_t* lengths_data = lengths.const_data_ptr(); index_t* offsets_data = offsets.data_ptr(); - const index_t* indices_data = indices.data_ptr(); + const index_t* indices_data = indices.const_data_ptr(); scalar_t* weights_data = nullptr; scalar_t* 
new_weights_data = nullptr; index_t* new_pos_data = nullptr; - index_t* const new_lengths_data = new_lengths.data_ptr(); - index_t* const new_offsets_data = new_offsets.data_ptr(); - index_t* const new_indices_data = new_indices.data_ptr(); + index_t* const new_lengths_data = new_lengths.mutable_data_ptr(); + index_t* const new_offsets_data = new_offsets.mutable_data_ptr(); + index_t* const new_indices_data = new_indices.mutable_data_ptr(); if (has_weight) { weights_data = weights.value().data_ptr(); @@ -1410,18 +1410,18 @@ void _block_bucketize_sparse_features_2d_weights_cpu_kernel( const int32_t B = lengths_size / T; auto offsets = at::empty({lengths_size + 1}, lengths.options()); auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options()); - const offset_t* lengths_data = lengths.data_ptr(); + const offset_t* lengths_data = lengths.const_data_ptr(); offset_t* offsets_data = offsets.data_ptr(); - const index_t* indices_data = indices.data_ptr(); + const index_t* indices_data = indices.const_data_ptr(); scalar_t* weights_data = weights.data_ptr(); scalar_t* new_weights_data = new_weights.data_ptr(); index_t* new_pos_data = nullptr; index_t* unbucketize_permute_data = nullptr; index_t* bag_mapping_data = nullptr; - offset_t* const new_lengths_data = new_lengths.data_ptr(); - offset_t* const new_offsets_data = new_offsets.data_ptr(); - index_t* const new_indices_data = new_indices.data_ptr(); - const index_t* const block_sizes_data = block_sizes.data_ptr(); + offset_t* const new_lengths_data = new_lengths.mutable_data_ptr(); + offset_t* const new_offsets_data = new_offsets.mutable_data_ptr(); + index_t* const new_indices_data = new_indices.mutable_data_ptr(); + const index_t* const block_sizes_data = block_sizes.const_data_ptr(); offset_t* batch_sizes_data = nullptr; const auto variable_batch_size = batch_size_per_feature.has_value(); const auto variable_bucket_sizes = @@ -1820,9 +1820,9 @@ void reorder_batched_ad_lengths_( output_batch_size = max_batch_size; } - const auto* batch_offsets_data = batch_offsets.data_ptr(); - const auto* cat_ad_lengths_data = cat_ad_lengths.data_ptr(); - auto* output_data = output.data_ptr(); + const auto* batch_offsets_data = batch_offsets.const_data_ptr(); + const auto* cat_ad_lengths_data = cat_ad_lengths.const_data_ptr(); + auto* output_data = output.mutable_data_ptr(); at::parallel_for( 0, nB * nT, FALSE_SHARING_PAD, [&](int64_t tb_begin, int64_t tb_end) { auto b_begin = tb_begin / nT; @@ -1934,12 +1934,12 @@ void reorder_batched_ad_indices_cpu_( const int64_t nB = batch_offsets.numel() - 1; const int64_t nT = (reordered_cat_ad_offsets.numel() - 1) / num_ads_in_batch; - const auto* batch_offsets_data = batch_offsets.data_ptr(); - const auto* cat_ad_offsets_data = cat_ad_offsets.data_ptr(); + const auto* batch_offsets_data = batch_offsets.const_data_ptr(); + const auto* cat_ad_offsets_data = cat_ad_offsets.const_data_ptr(); const auto* reordered_cat_ad_offsets_data = reordered_cat_ad_offsets.data_ptr(); - const auto* cat_ad_indices_data = cat_ad_indices.data_ptr(); - auto* output_data = output.data_ptr(); + const auto* cat_ad_indices_data = cat_ad_indices.const_data_ptr(); + auto* output_data = output.mutable_data_ptr(); at::parallel_for( 0, nB * nT, FALSE_SHARING_PAD, [&](int64_t tb_begin, int64_t tb_end) { auto b_begin = tb_begin / nT; @@ -2002,17 +2002,17 @@ void cat_reorder_batched_ad_indices_cpu_( const int64_t nB = batch_offsets.numel() - 1; const int64_t nT = (reordered_cat_ad_offsets.numel() - 1) / max_batch_size; - const auto* 
-  const auto* batch_offsets_data = batch_offsets.data_ptr();
-  const auto* cat_ad_offsets_data = cat_ad_offsets.data_ptr();
+  const auto* batch_offsets_data = batch_offsets.const_data_ptr();
+  const auto* cat_ad_offsets_data = cat_ad_offsets.const_data_ptr();
   const auto* reordered_cat_ad_offsets_data =
       reordered_cat_ad_offsets.data_ptr();
-  auto* output_data = output.data_ptr();
+  auto* output_data = output.mutable_data_ptr();
   at::parallel_for(
       0, nB * nT, FALSE_SHARING_PAD, [&](int64_t tb_begin, int64_t tb_end) {
         auto b_begin = tb_begin / nT;
         auto b_end = (tb_end + nT - 1) / nT;
         for (auto b : c10::irange(b_begin, b_end)) {
-          const auto* ad_indices_data = ad_indices[b].data_ptr();
+          const auto* ad_indices_data = ad_indices[b].const_data_ptr();
           const auto num_ads_b =
               batch_offsets_data[b + 1] - batch_offsets_data[b];
           int64_t t_begin = (b == b_begin) ? tb_begin % nT : 0;
@@ -2069,14 +2069,14 @@ void reorder_batched_sequence_embeddings_cpu_(
   const int64_t nT = (reordered_cat_sequence_embeddings_offsets.numel() - 1) /
       num_items_in_batch;
 
-  const auto* batch_offsets_data = batch_offsets.data_ptr();
+  const auto* batch_offsets_data = batch_offsets.const_data_ptr();
   const auto* cat_sequence_embeddings_offsets_data =
       cat_sequence_embeddings_offsets.data_ptr();
   const auto* reordered_cat_sequence_embeddings_offsets_data =
       reordered_cat_sequence_embeddings_offsets.data_ptr();
   const auto* cat_sequence_embeddings_data =
       cat_sequence_embeddings.data_ptr();
-  auto* output_data = output.data_ptr();
+  auto* output_data = output.mutable_data_ptr();
   at::parallel_for(
       0, nB * nT, FALSE_SHARING_PAD, [&](int64_t tb_begin, int64_t tb_end) {
         auto b_begin = tb_begin / nT;
@@ -2341,12 +2341,12 @@ Tensor batched_unary_embeddings_forward_cpu(
   AT_DISPATCH_INDEX_TYPES(table_offsets.scalar_type(), "unary_indices", [&] {
     FBGEMM_DISPATCH_FLOATING_TYPES(
         weight.scalar_type(), "batched_unary_embeddings_forward_cpu", [&] {
-          const index_t* table_offsets_data = table_offsets.data_ptr();
-          const index_t* offsets_data = offsets.data_ptr();
-          const index_t* indices_data = indices.data_ptr();
+          const index_t* table_offsets_data = table_offsets.const_data_ptr();
+          const index_t* offsets_data = offsets.const_data_ptr();
+          const index_t* indices_data = indices.const_data_ptr();
           const index_t sum_E = table_offsets_data[T];
-          auto* output_data = output.data_ptr();
-          const auto* weight_data = weight.data_ptr();
+          auto* output_data = output.mutable_data_ptr();
+          const auto* weight_data = weight.const_data_ptr();
 
           for (const auto n : c10::irange(N)) {
             for (const auto b : c10::irange(B)) {
@@ -2716,7 +2716,7 @@ Tensor segment_sum_csr_cpu(
             batch_size,
             csr_seg.data_ptr(),
             values.data_ptr(),
-            output.data_ptr());
+            output.mutable_data_ptr());
       });
   });
   return output;
@@ -2867,7 +2867,7 @@ Tensor& lengths_range_out(
   AT_DISPATCH_INDEX_TYPES(
       t_in_contig->scalar_type(), "lengths_range_compute", [&]() {
         const auto* input_data = t_in_contig->data_ptr();
-        auto* output_data = output.data_ptr();
+        auto* output_data = output.mutable_data_ptr();
 
         index_t offset = 0;
         for (const auto i : c10::irange(num_seq)) {
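The reorder_batched_* hunks above keep the existing at::parallel_for structure, which walks a flattened [0, nB * nT) range and recovers (batch, table) pairs from the chunk bounds. A standalone sketch of that index math is below; the function and variable names are illustrative, and the t_end handling is an assumption, since the hunks only show the first lines of the loop.

    // Illustrative sketch only: mapping a flattened work range back to (b, t).
    #include <cstdint>
    #include <utility>
    #include <vector>

    void for_each_bt(
        int64_t tb_begin,
        int64_t tb_end,
        int64_t nT,
        std::vector<std::pair<int64_t, int64_t>>& out) {
      const int64_t b_begin = tb_begin / nT;
      const int64_t b_end = (tb_end + nT - 1) / nT; // round up over a partial row
      for (int64_t b = b_begin; b < b_end; ++b) {
        // The first chunk may start mid-row; the last chunk may end mid-row.
        const int64_t t_begin = (b == b_begin) ? tb_begin % nT : 0;
        const int64_t t_end =
            (b == b_end - 1 && tb_end % nT != 0) ? tb_end % nT : nT;
        for (int64_t t = t_begin; t < t_end; ++t) {
          out.emplace_back(b, t);
        }
      }
    }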
@@ -3244,7 +3244,7 @@ Tensor pack_segments_forward_cpu(
   AT_DISPATCH_INDEX_TYPES(
       lengths.scalar_type(), "pack_segments_cpu", ([&]() {
-        const auto* const lengths_data = lengths.data_ptr();
+        const auto* const lengths_data = lengths.const_data_ptr();
 
         // Shape of output is batch_size x max_len x ...
         auto shape = t_in_cont->sizes().vec(); // Get copy of current shape
@@ -3263,8 +3263,8 @@ Tensor pack_segments_forward_cpu(
             t_in_cont->sizes().slice(1, t_in_cont->sizes().size() - 1);
         const auto block_size = c10::multiply_integers(sizes);
         const auto block_bytesize = t_in_cont->itemsize() * block_size;
-        const auto* const data_ptr = t_in_cont->data_ptr();
-        auto* const out_data = packed_tensor.data_ptr();
+        const auto* const data_ptr = t_in_cont->const_data_ptr();
+        auto* const out_data = packed_tensor.mutable_data_ptr();
         int64_t start = 0;
         for (const auto i : c10::irange(lengths.sizes()[0])) {
           const auto len =
@@ -3304,7 +3304,7 @@ std::tuple> pack_segments_forward_cpu_v2(
   AT_DISPATCH_INDEX_TYPES(
       lengths.scalar_type(), "pack_segments_cpu", ([&]() {
-        const auto* const lengths_data = lengths.data_ptr();
+        const auto* const lengths_data = lengths.const_data_ptr();
 
         // Shape of output is batch_size x max_len x ...
         auto shape = t_in_cont->sizes().vec(); // Get copy of current shape
@@ -3338,8 +3338,8 @@ std::tuple> pack_segments_forward_cpu_v2(
             t_in_cont->sizes().slice(1, t_in_cont->sizes().size() - 1);
         const auto block_size = c10::multiply_integers(sizes);
         const auto block_bytesize = t_in_cont->itemsize() * block_size;
-        const auto* const data_ptr = t_in_cont->data_ptr();
-        auto* const out_data = packed_tensor.data_ptr();
+        const auto* const data_ptr = t_in_cont->const_data_ptr();
+        auto* const out_data = packed_tensor.mutable_data_ptr();
         int64_t start = 0;
         for (const auto i : c10::irange(lengths.sizes()[0])) {
           const auto len =
@@ -3417,7 +3417,7 @@ Tensor pack_segments_backward_cpu(
         const auto block_size = c10::multiply_integers(sizes);
         const auto block_bytesize = data.itemsize() * block_size;
         const auto* const data_ptr = data_contig->data_ptr();
-        auto* const out_data = unpacked_tensor.data_ptr();
+        auto* const out_data = unpacked_tensor.mutable_data_ptr();
 
         int64_t start = 0;
         for (const auto i : c10::irange(lengths.sizes()[0])) {
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
index 36f5d152f5..3e0d2e2518 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
@@ -92,11 +92,11 @@ class LookupFunctionBatchedUnaryEmbeddingOp
     // the batched_unary_embeddings_backward_cuda assumes contiguous inputs.
     // may cause illegal memory access when it is not
     auto grad_output = grad_outputs[0];
-    if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0 ||
+    if (reinterpret_cast(grad_output.mutable_data_ptr()) % 16 != 0 ||
         grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
       grad_output = grad_output.contiguous();
     }
-    if (reinterpret_cast(grad_output.data_ptr()) % 16 != 0) {
+    if (reinterpret_cast(grad_output.mutable_data_ptr()) % 16 != 0) {
       grad_output = at::empty_like(grad_output).copy_(grad_output);
     }
     auto grad_weight = batched_unary_embeddings_backward_cuda(
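For reference, the guard in the hunk above reduces to an address and stride check. The sketch below restates it outside the autograd boilerplate; the helper name is an assumption, and it uses a const accessor only because nothing but the pointer value is inspected.

    // Sketch only, not from the patch: the 16-byte alignment and stride checks
    // that decide whether grad_output must be made contiguous or re-allocated.
    #include <ATen/ATen.h>
    #include <cstdint>

    bool is_aligned_for_backward(const at::Tensor& grad_output) {
      const auto addr =
          reinterpret_cast<std::uintptr_t>(grad_output.const_data_ptr());
      return addr % 16 == 0 && grad_output.stride(1) == 1 &&
          grad_output.stride(0) % 4 == 0;
    }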
@@ -346,7 +346,7 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
       // Store args
       input_ptrs[i] = reinterpret_cast(input_contigs[i]->data_ptr());
-      output_ptrs[i] = reinterpret_cast(output.data_ptr());
+      output_ptrs[i] = reinterpret_cast(output.mutable_data_ptr());
       indices_ptrs[i] = reinterpret_cast(index_contigs[i]->data_ptr());
       warp_offsets_group[i] = warp_offset;
       num_cols_group[i] = num_cols_;
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_backward.cu b/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_backward.cu
index a073a9e8e1..4af1705032 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_backward.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_backward.cu
@@ -67,7 +67,7 @@ DLL_PUBLIC Tensor pack_segments_backward_cuda(
   Tensor unpacked_tensor; // The output tensor
 
   AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "unpack_segments_cuda", [&] {
-    const auto* const lengths_data = lengths.data_ptr();
+    const auto* const lengths_data = lengths.const_data_ptr();
 
     // Create output tensor of appropriate dimensions
     auto shape = data_contig->sizes().vec();
@@ -90,7 +90,7 @@ DLL_PUBLIC Tensor pack_segments_backward_cuda(
     const auto cell_size = data_contig->numel() /
         (data_contig->size(0) * data_contig->size(1));
     const auto* const data_ptr = data_contig->data_ptr();
-    auto* const out_data = unpacked_tensor.data_ptr();
+    auto* const out_data = unpacked_tensor.mutable_data_ptr();
 
     FBGEMM_LAUNCH_KERNEL(
         (unpack_segments_cuda_kernel),
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_forward.cu b/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_forward.cu
index b886b630de..dcc65ad0a4 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_forward.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_forward.cu
@@ -109,7 +109,7 @@ DLL_PUBLIC Tensor pack_segments_forward_cuda(
   Tensor packed_tensor;
 
   AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "pack_segments_cuda", [&] {
-    const auto* const lengths_data = lengths.data_ptr();
+    const auto* const lengths_data = lengths.const_data_ptr();
 
     // Shape of output is batch_size x max_len x ...
     auto shape = t_in_c.sizes().vec(); // Get copy of current shape
@@ -128,8 +128,8 @@ DLL_PUBLIC Tensor pack_segments_forward_cuda(
     FBGEMM_DISPATCH_ALL_TYPES(
         t_in_c.scalar_type(), "pack_segments_cuda-packing", [&] {
-          const auto* const data_ptr = t_in_c.data_ptr();
-          auto* const out_data = packed_tensor.data_ptr();
+          const auto* const data_ptr = t_in_c.const_data_ptr();
+          auto* const out_data = packed_tensor.mutable_data_ptr();
 
           const auto num_seq = lengths.size(0);
           const auto cell_size = t_in_c.numel() / t_in_c.size(0);
 
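The pack_segments_* hunks above share one copy pattern: each variable-length segment becomes one row of a dense batch_size x max_len output, copied in chunks of block_bytesize = itemsize * block_size bytes. A CPU-only sketch of that pattern follows; the names and the memcpy loop are assumptions for illustration, not code from these files.

    // Illustrative sketch only: byte-wise packing of ragged segments into a
    // dense, zero-initialized [num_seq, max_len, ...] buffer.
    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    void pack_segments_bytes(
        const uint8_t* src, // contiguous input rows, back to back
        uint8_t* dst, // zero-initialized output of num_seq * max_len rows
        const int64_t* lengths, // rows per segment
        int64_t num_seq,
        int64_t max_len,
        int64_t row_bytes) { // bytes per row ("block_bytesize")
      int64_t start = 0;
      for (int64_t i = 0; i < num_seq; ++i) {
        const int64_t len = std::min(lengths[i], max_len);
        std::memcpy(
            dst + i * max_len * row_bytes, // start of output row block i
            src + start * row_bytes, // start of input segment i
            len * row_bytes);
        start += lengths[i];
      }
    }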
@@ -191,7 +191,7 @@ pack_segments_forward_cuda_v2(
   std::optional presence_mask;
 
   AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "pack_segments_cuda", [&] {
-    const auto* const lengths_data = lengths.data_ptr();
+    const auto* const lengths_data = lengths.const_data_ptr();
 
     // Shape of output is batch_size x max_len x ...
     auto shape = t_in_c.sizes().vec(); // Get copy of current shape
@@ -226,8 +226,8 @@ pack_segments_forward_cuda_v2(
     FBGEMM_DISPATCH_ALL_TYPES(
         t_in_c.scalar_type(), "pack_segments_cuda-packing", [&] {
-          const auto* const data_ptr = t_in_c.data_ptr();
-          auto* const out_data = packed_tensor.data_ptr();
+          const auto* const data_ptr = t_in_c.const_data_ptr();
+          auto* const out_data = packed_tensor.mutable_data_ptr();
 
           const auto num_seq = lengths.size(0);
           const auto cell_size = t_in_c.numel() / t_in_c.size(0);
 
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_range.cu b/fbgemm_gpu/src/sparse_ops/sparse_range.cu
index f6d2492989..2ba3772056 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_range.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_range.cu
@@ -160,7 +160,7 @@ DLL_PUBLIC Tensor lengths_range_cuda(
         num_seq,
         output_size,
         offsets.data_ptr(),
-        output.data_ptr());
+        output.mutable_data_ptr());
   });
 
   return output;
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu b/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
index 5cc8a212f3..8a4f3081dc 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
@@ -83,7 +83,7 @@ DLL_PUBLIC Tensor segment_sum_csr_cuda(
             batch_size,
             csr_seg.data_ptr(),
             values.data_ptr(),
-            output.data_ptr());
+            output.mutable_data_ptr());
       });
   });
 
diff --git a/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp b/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp
index fd38ca1c89..ec426fa5ee 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp
+++ b/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp
@@ -116,7 +116,7 @@ folly::Optional CacheLibCache::get(const at::Tensor& key_tensor) {
   folly::Optional res;
   FBGEMM_DISPATCH_INTEGRAL_TYPES(key_tensor.scalar_type(), "get", [&] {
     using index_t = scalar_t;
-    auto key = *(key_tensor.data_ptr());
+    auto key = *(key_tensor.const_data_ptr());
     auto key_str = folly::StringPiece(
         reinterpret_cast(&key), sizeof(index_t));
     auto item = cache_->find(key_str);
diff --git a/fbgemm_gpu/src/tbe/eeg/indices_estimator.cpp b/fbgemm_gpu/src/tbe/eeg/indices_estimator.cpp
index c6509e18fe..b2f2ade0b9 100644
--- a/fbgemm_gpu/src/tbe/eeg/indices_estimator.cpp
+++ b/fbgemm_gpu/src/tbe/eeg/indices_estimator.cpp
@@ -18,7 +18,7 @@ namespace fbgemm_gpu::tbe {
 
 void IndicesEstimator::populateIndexFreqs_(const torch::Tensor& indices) {
   // Count the frequency of indices
-  const auto* data = indices.data_ptr();
+  const auto* data = indices.const_data_ptr();
   for (auto i = 0; i < indices.numel(); i++) {
     const auto idx = data[i];
     indexCounts_[idx] += 1;
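The final hunk is a plain frequency count over an index tensor. A self-contained sketch of the same loop against an int64 tensor is below; the free function is written for illustration only and is not part of the FBGEMM API.

    // Illustrative sketch only: count occurrences of each index value.
    #include <ATen/ATen.h>
    #include <cstdint>
    #include <unordered_map>

    std::unordered_map<int64_t, int64_t> count_index_freqs(
        const at::Tensor& indices) {
      TORCH_CHECK(indices.scalar_type() == at::kLong, "expected int64 indices");
      const auto contig = indices.contiguous(); // pointer loop assumes contiguity
      std::unordered_map<int64_t, int64_t> counts;
      const auto* data = contig.const_data_ptr<int64_t>();
      for (int64_t i = 0; i < contig.numel(); ++i) {
        counts[data[i]] += 1;
      }
      return counts;
    }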