@@ -78,9 +78,9 @@ void pruned_hashmap_insert_{{ wdesc }}_cpu(
using uidx_t =
std::conditional_t<std::is_same_v<index_t, int64_t>, uint64_t, uint32_t>;

- const auto* indices_acc = indices.data_ptr<index_t>();
- const auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
- const auto* offsets_acc = offsets.data_ptr<index_t>();
+ const auto* indices_acc = indices.const_data_ptr<index_t>();
+ const auto* dense_indices_acc = dense_indices.const_data_ptr<index_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<index_t>();

auto hash_table_acc = hash_table.accessor<hash_t, 2>();
const auto hash_table_offsets_acc = hash_table_offsets.accessor<int64_t, 1>();
@@ -228,14 +228,14 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
return output;
}

- const int32_t* weights_placements_ptr = weights_placements.data_ptr<int32_t>();
+ const int32_t* weights_placements_ptr = weights_placements.const_data_ptr<int32_t>();
const uint8_t* weights_acc;

- const auto* weights_tys_acc = weights_tys.data_ptr<uint8_t>();
+ const auto* weights_tys_acc = weights_tys.const_data_ptr<uint8_t>();

DISPATCH_OUTPUT_TYPES(output.scalar_type(), "intn_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", [&] {
{% if weighted %}
- const float* indice_weights_acc = indice_weights.data_ptr<float>();
+ const float* indice_weights_acc = indice_weights.const_data_ptr<float>();
{% endif %}

using float16 = uint16_t;
@@ -250,15 +250,15 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
float16,
std::conditional<std::is_same<output_t, at::BFloat16>::value, bfloat16, float>::type> ::type;
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_", [&] {
- const auto* indices_acc = indices.data_ptr<index_t>();
- const auto* offsets_acc = offsets.data_ptr<index_t>();
- const auto* weights_offsets_acc = weights_offsets.data_ptr<int64_t>();
+ const auto* indices_acc = indices.const_data_ptr<index_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<index_t>();
+ const auto* weights_offsets_acc = weights_offsets.const_data_ptr<int64_t>();

- auto* output_acc = output.data_ptr<output_t>();
+ auto* output_acc = output.mutable_data_ptr<output_t>();

for (const auto t : c10::irange(T)) {
{% if not nobag %}
- const auto* D_offsets_acc = D_offsets.data_ptr<int32_t>();
+ const auto* D_offsets_acc = D_offsets.const_data_ptr<int32_t>();
const int32_t D_start = D_offsets_acc[t];
const int32_t D_end = D_offsets_acc[t + 1];
const int32_t D = D_end - D_start;
@@ -295,7 +295,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
{% if nobag %}
// Create virtual offsets for the nobag case. Lengths are all ones.
const auto offsets_nobag = at::arange(*offsets_begin_ptr, offsets_acc[(t + 1) * B] + 1, offsets.options());
- const index_t* offsets_nobag_ptr = offsets_nobag.data_ptr<index_t>();
+ const index_t* offsets_nobag_ptr = offsets_nobag.const_data_ptr<index_t>();
TORCH_CHECK(offsets_nobag.numel() == index_size + 1);
TORCH_CHECK(offsets_nobag_ptr[index_size] - offsets_nobag_ptr[0] == index_size);
{% endif %}
@@ -449,9 +449,9 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu(
using uidx_t =
std::conditional_t<std::is_same_v<index_t, int64_t>, uint64_t, uint32_t>;

- const auto* indices_acc = indices.data_ptr<index_t>();
+ const auto* indices_acc = indices.const_data_ptr<index_t>();
auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
- const auto* offsets_acc = offsets.data_ptr<index_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<index_t>();

const auto hash_table_acc = hash_table.accessor<hash_t, 2>();
const auto hash_table_offsets_acc = hash_table_offsets.accessor<int64_t, 1>();
@@ -527,12 +527,12 @@ Tensor pruned_array_lookup_cpu(
using remap_t = index_t;

AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "pruned_array_lookup_cpu_1", [&] {
- const auto* indices_acc = indices.data_ptr<index_t>();
+ const auto* indices_acc = indices.const_data_ptr<index_t>();
auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
- const auto* offsets_acc = offsets.data_ptr<index_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<index_t>();

- const auto index_remappings_acc = index_remappings.data_ptr<remap_t>();
- const auto index_remappings_offsets_acc = index_remappings_offsets.data_ptr<int64_t>();
+ const auto index_remappings_acc = index_remappings.const_data_ptr<remap_t>();
+ const auto index_remappings_offsets_acc = index_remappings_offsets.const_data_ptr<int64_t>();

at::parallel_for(0, T, 1, [&](int64_t begin, int64_t end) {
for (const auto t : c10::irange(begin, end)) {
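For context on the pattern running through these hunks: the legacy `data_ptr<T>()` returns `T*` even on logically read-only tensors, so the `const auto*` declarations above only const-qualified the local pointer. `const_data_ptr<T>()` returns `const T*` and cannot initialize a mutable pointer at all, while `mutable_data_ptr<T>()` returns `T*` and documents the intent to write. A minimal illustrative sketch, not code from this PR:

#include <ATen/ATen.h>

// Read path: const_data_ptr<T>() yields const T*, so an accidental write
// through the pointer is a compile-time error.
int64_t sum_indices(const at::Tensor& indices) {
  const auto* p = indices.const_data_ptr<int64_t>();
  int64_t s = 0;
  for (int64_t i = 0; i < indices.numel(); ++i) {
    s += p[i];
  }
  return s;
}

// Write path: mutable_data_ptr<T>() yields T* and makes the mutation
// explicit; `int64_t* bad = out.const_data_ptr<int64_t>();` would not compile.
void fill_iota(at::Tensor& out) {
  auto* p = out.mutable_data_ptr<int64_t>();
  for (int64_t i = 0; i < out.numel(); ++i) {
    p[i] = i;
  }
}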
@@ -412,9 +412,9 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder {
void insert(Tensor indices, Tensor dense_indices, Tensor offsets, int64_t T) {
int32_t B = (offsets.size(0) - 1) / T;
TORCH_CHECK(B > 0);
- const auto* indices_acc = indices.data_ptr<int32_t>();
+ const auto* indices_acc = indices.const_data_ptr<int32_t>();
auto* dense_indices_acc = dense_indices.data_ptr<int32_t>();
- const auto* offsets_acc = offsets.data_ptr<int32_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<int32_t>();
maps_.resize(T);
for (const auto t : c10::irange(T)) {
auto& map = maps_[t];
@@ -447,9 +447,9 @@ class PrunedMapCPU : public torch::jit::CustomClassHolder {
auto dense_indices = empty_like(indices);

AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "PrunedMapCPU::lookup", [&] {
- const auto* indices_acc = indices.data_ptr<index_t>();
+ const auto* indices_acc = indices.const_data_ptr<index_t>();
auto* dense_indices_acc = dense_indices.data_ptr<index_t>();
- const auto* offsets_acc = offsets.data_ptr<index_t>();
+ const auto* offsets_acc = offsets.const_data_ptr<index_t>();

for (const auto t : c10::irange(T)) {
auto& map = maps_[t];
@@ -563,7 +563,7 @@ struct TensorQueue : torch::CustomClassHolder {
const std::string key = "queue";
Tensor size_tensor;
size_tensor = dict.at(std::string(key + "/size")).cpu();
- const auto* size_tensor_acc = size_tensor.data_ptr<int64_t>();
+ const auto* size_tensor_acc = size_tensor.const_data_ptr<int64_t>();
int64_t queue_size = size_tensor_acc[0];

for (const auto index : c10::irange(queue_size)) {
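On the `size_tensor` read above: the tensor is pinned to CPU and the first element is read through the const pointer. For a single scalar, `item<T>()` is the usual shorthand; a small sketch under that assumption:

#include <ATen/ATen.h>

// Two ways to read one scalar out of a (possibly non-CPU) tensor.
int64_t read_queue_size(const at::Tensor& size_tensor) {
  // Pattern used in this hunk: move to CPU, then index the const pointer.
  const at::Tensor cpu = size_tensor.cpu();
  return cpu.const_data_ptr<int64_t>()[0];
  // Equivalent shorthand: return size_tensor.item<int64_t>();
}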
@@ -145,11 +145,11 @@ split_embedding_backward_codegen_{{ optimizer }}_cpu(
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "split_embedding_backward_approx_cpu_kernel_1", [&] {

auto grad_stride = grad_output.size(1);
- const float* grad_output_data = grad_output.data_ptr<float>();
+ const float* grad_output_data = grad_output.const_data_ptr<float>();
float* host_weights_data = host_weights.data_ptr<float>();

- const auto* indices_data = indices.data_ptr<index_t>();
- const auto* offsets_data = offsets.data_ptr<index_t>();
+ const auto* indices_data = indices.const_data_ptr<index_t>();
+ const auto* offsets_data = offsets.const_data_ptr<index_t>();

const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
float* momentum1_data = momentum1_host.data_ptr<float>();
@@ -67,7 +67,7 @@ void split_embedding_backward_exact_cpu_kernel(
const at::TensorAccessor<int64_t, 1> momentum2_offsets_data,
{% endif %}
{{ args.split_cpu_kernel_args | join(", ") }}) {
- const grad_t* grad_output_data = grad_output.data_ptr<grad_t>();
+ const grad_t* grad_output_data = grad_output.const_data_ptr<grad_t>();
auto host_weights_data = host_weights.accessor<scalar_t, 1>();
const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();

@@ -252,12 +252,12 @@ void split_embedding_nobag_backward_exact_cpu_kernel(
const at::TensorAccessor<int64_t, 1> momentum2_offsets_data,
{% endif %}
{{ args.split_cpu_kernel_args | join(", ") }}) {
- const grad_t* grad_output_data = grad_output.data_ptr<grad_t>();
+ const grad_t* grad_output_data = grad_output.const_data_ptr<grad_t>();
auto host_weights_data = host_weights.accessor<scalar_t, 1>();
const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
- const auto indices_data = indices.data_ptr<index_t>();
- const auto offsets_data = offsets.data_ptr<index_t>();
- const auto weights_offsets_data = weights_offsets.data_ptr<int64_t>();
+ const auto indices_data = indices.const_data_ptr<index_t>();
+ const auto offsets_data = offsets.const_data_ptr<index_t>();
+ const auto weights_offsets_data = weights_offsets.const_data_ptr<int64_t>();

typedef std::unordered_map<int64_t, std::vector<at::acc_type<grad_t, true>>> tb_grad_buffer_map_t;
typedef std::unordered_map<int64_t, int64_t> tb_fb_map_t;
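The `at::acc_type<grad_t, true>` buffers declared just above accumulate in a wider type than the gradient's storage type (for example float when grad_t is at::Half), which keeps long reductions accurate. A hedged sketch of the idea, not code from this file:

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>

// Sum reduced-precision gradients in the promoted accumulator type.
template <typename grad_t>
at::acc_type<grad_t, true> sum_grads(const at::Tensor& grad) {
  using acc_t = at::acc_type<grad_t, /*is_cuda=*/true>;
  const auto* g = grad.const_data_ptr<grad_t>();
  acc_t s = 0;
  for (int64_t i = 0; i < grad.numel(); ++i) {
    s += static_cast<acc_t>(g[i]);
  }
  return s;
}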
@@ -67,18 +67,18 @@ void split_embedding_forward_cpu_kernel(

const auto D_offsets_data = D_offsets.accessor<int, 1>();
const auto weights_offsets_data = weights_offsets.accessor<int64_t, 1>();
- const auto indices_data = indices.data_ptr<index_t>();
- const auto offsets_data = offsets.data_ptr<offset_t>();
+ const auto indices_data = indices.const_data_ptr<index_t>();
+ const auto offsets_data = offsets.const_data_ptr<offset_t>();
const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();

- const auto weights_data = weights.data_ptr<weights_t>();
+ const auto weights_data = weights.const_data_ptr<weights_t>();
// If indice_weights not defined, then this accessor won't be used.
// The else condition is just to make compiler happy
const auto indice_weights_data = indice_weights.defined()
? indice_weights.data_ptr<ind_weights_t>()
: nullptr;

- auto output_data = output.data_ptr<output_t>();
+ auto output_data = output.mutable_data_ptr<output_t>();
auto output_stride = output.size(1);

constexpr bool use_fbgemm = (std::is_same<weights_t, float>::value ||
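The `indice_weights.defined() ? ... : nullptr` guard above is the standard way to take a raw pointer from an optional tensor, since calling a data-pointer accessor on an undefined tensor throws. A sketch of the same pattern with the const accessor (function and names are illustrative, not from this file):

#include <ATen/ATen.h>

// Optional per-sample weights: fall back to nullptr when the tensor is
// undefined, then branch on the pointer at the use site.
float weighted_sum(const at::Tensor& values, const at::Tensor& maybe_weights) {
  const float* v = values.const_data_ptr<float>();
  const float* w =
      maybe_weights.defined() ? maybe_weights.const_data_ptr<float>() : nullptr;
  float s = 0.0f;
  for (int64_t i = 0; i < values.numel(); ++i) {
    s += w ? v[i] * w[i] : v[i];
  }
  return s;
}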
@@ -43,10 +43,10 @@ void split_embedding_nobag_codegen_forward_cpu_kernel(

const auto weights_offsets_data = weights_offsets.accessor<int64_t, 1>();
const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
- const auto indices_data = indices.data_ptr<index_t>();
- const auto offsets_data = offsets.data_ptr<offset_t>();
- const auto weights_data = weights.data_ptr<weights_t>();
- auto output_data = output.data_ptr<output_t>();
+ const auto indices_data = indices.const_data_ptr<index_t>();
+ const auto offsets_data = offsets.const_data_ptr<offset_t>();
+ const auto weights_data = weights.const_data_ptr<weights_t>();
+ auto output_data = output.mutable_data_ptr<output_t>();

int64_t T = weights_offsets.size(0);
int64_t B = (offsets.size(0) - 1) / T;
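The `B = (offsets.size(0) - 1) / T` arithmetic above reflects the flattened CSR-style layout these kernels share: one offsets array of length T * B + 1 covers all tables, and bag b of table t spans [offsets[t * B + b], offsets[t * B + b + 1]). A sketch, with the printout purely illustrative:

#include <ATen/ATen.h>
#include <cstdio>

// Walk the flattened offsets array and report each bag's pooling factor.
void print_bag_sizes(const at::Tensor& offsets, int64_t T) {
  const int64_t B = (offsets.size(0) - 1) / T;
  const auto* off = offsets.const_data_ptr<int64_t>();
  for (int64_t t = 0; t < T; ++t) {
    for (int64_t b = 0; b < B; ++b) {
      const int64_t L = off[t * B + b + 1] - off[t * B + b];
      std::printf("table %lld bag %lld: %lld indices\n",
                  static_cast<long long>(t), static_cast<long long>(b),
                  static_cast<long long>(L));
    }
  }
}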
@@ -846,7 +846,7 @@ batch_index_select_dim0_codegen_forward_cuda(
reinterpret_cast<uint32_t*>(D_offsets.data_ptr<int32_t>()),
weights_offsets.data_ptr<int64_t>(),
lxu_cache_locations.data_ptr<int32_t>(),
- output.data_ptr<output_t>()
+ output.mutable_data_ptr<output_t>()
);
}
{%- endif %} // if has_experimental
@@ -37,7 +37,7 @@ Tensor tensor_from_vec(const std::vector<int64_t>& vec) {
std::vector<int64_t> vecref_from_tensor(const Tensor& t) {
TORCH_CHECK(t.is_contiguous());
const auto numel = static_cast<size_t>(t.numel());
- const auto* ptr = t.data_ptr<int64_t>();
+ const auto* ptr = t.const_data_ptr<int64_t>();
return std::vector(ptr, ptr + numel);
};

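`vecref_from_tensor` above copies a contiguous int64 tensor into a `std::vector`; the opposite direction is typically a from_blob-plus-clone so the tensor owns its memory. A sketch of that inverse, assuming the conventional approach rather than this file's exact implementation:

#include <ATen/ATen.h>
#include <vector>

// Wrap the vector's buffer without copying, then clone so the returned
// tensor owns its storage and can outlive the vector.
at::Tensor tensor_from_vec_sketch(const std::vector<int64_t>& vec) {
  return at::from_blob(
             const_cast<int64_t*>(vec.data()),
             {static_cast<int64_t>(vec.size())},
             at::dtype(at::kLong))
      .clone();
}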
@@ -291,7 +291,7 @@ class BatchIndexSelectDim0GPUOp
auto _grad_output = grad_output;
// FIXME: to support aligned memory access in Vec4T load/store function
// 16 for FP32 and 8 for FP16
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
+ if (reinterpret_cast<uint64_t>(grad_output.mutable_data_ptr()) % 16 != 0 ||
at::has_internal_overlap(grad_output) != at::MemOverlap::No) {
_grad_output = at::empty_like(grad_output).copy_(grad_output);
}
@@ -612,7 +612,7 @@ class BatchIndexSelectDim0TensorGPUOp
auto _grad_output = grad_output;
// FIXME: to support aligned memory access in Vec4T load/store function
// 16 for FP32 and 8 for FP16
- if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
+ if (reinterpret_cast<uint64_t>(grad_output.mutable_data_ptr()) % 16 != 0 ||
at::has_internal_overlap(grad_output) != at::MemOverlap::No) {
_grad_output = at::empty_like(grad_output).copy_(grad_output);
}
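Both operators above guard vectorized Vec4T access the same way: require a 16-byte-aligned base pointer and no internal overlap, otherwise fall back to a fresh contiguous copy. A standalone sketch of that guard (helper name hypothetical):

#include <ATen/ATen.h>
#include <ATen/MemoryOverlap.h>
#include <cstdint>

// Return a tensor safe for 16-byte vector loads: either the input itself or
// a fresh copy from the allocator, which over-aligns new allocations.
at::Tensor ensure_vec4_friendly(const at::Tensor& t) {
  const bool aligned =
      reinterpret_cast<std::uintptr_t>(t.const_data_ptr()) % 16 == 0;
  const bool no_overlap = at::has_internal_overlap(t) == at::MemOverlap::No;
  if (aligned && no_overlap) {
    return t;
  }
  return at::empty_like(t).copy_(t);
}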
fbgemm_gpu/experimental/example/src/cutlass_sgemm_nn.cu (6 changes: 3 additions & 3 deletions)
@@ -35,9 +35,9 @@ at::Tensor sgemm_float_cuda(
const auto ldb = N;
const auto ldc = N;

- const auto* A = TA.data_ptr<float>();
- const auto* B = TB.data_ptr<float>();
- const auto* C = TC.data_ptr<float>();
+ const auto* A = TA.const_data_ptr<float>();
+ const auto* B = TB.const_data_ptr<float>();
+ const auto* C = TC.const_data_ptr<float>();

const auto alpha = static_cast<float>(alpha_);
const auto beta = static_cast<float>(beta_);
@@ -270,11 +270,11 @@ struct GenRunner {

typename Operation::Arguments arguments{
problem_shape,
- static_cast<const int*>(seqlen_kv.data_ptr()),
- static_cast<const int*>(batch_idx ? batch_idx.value().data_ptr() : nullptr),
+ static_cast<const int*>(seqlen_kv.const_data_ptr()),
+ static_cast<const int*>(batch_idx ? batch_idx.value().const_data_ptr() : nullptr),
static_cast<int>(split_k_size),
static_cast<int>(window_size),
- static_cast<const Element*>(q.data_ptr()),
+ static_cast<const Element*>(q.const_data_ptr()),
stride_q,
static_cast<const Element*>(nullptr), // ptr_new_k
stride_new_k,
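The `static_cast<const Element*>(q.const_data_ptr())` calls above use the untyped overload: with no template argument, `const_data_ptr()` returns `const void*` (and `mutable_data_ptr()` returns `void*`), which suits element types ATen does not dispatch on, such as CUTLASS's own. A short sketch:

#include <ATen/ATen.h>

// Cast once at the library boundary; Element stands in for a non-ATen
// element type (e.g. a CUTLASS half type).
template <typename Element>
const Element* as_element_ptr(const at::Tensor& t) {
  return static_cast<const Element*>(t.const_data_ptr());
}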
@@ -757,7 +757,7 @@ at::Tensor quantize_qkv_per_head(
dim3 block_size(kThreadsPerWarp, kWarpsPerBlock);
dim3 grid_size(kMaxBlocks);
auto scale_q = at::zeros({B, N_KVH_L}, XQ_O.options().dtype(at::kFloat));
- float* const scale_q_ptr = scale_q.data_ptr<float>();
+ float* const scale_q_ptr = scale_q.mutable_data_ptr<float>();
// Launch the kernel

FBGEMM_LAUNCH_KERNEL(
@@ -105,7 +105,7 @@ at::Tensor f8f8bf16_cublas(
sizeof(fastAccuMode)));

if (Ainvs.has_value()) {
- const float* Ainvs_pt = Ainvs.value().data_ptr<float>();
+ const float* Ainvs_pt = Ainvs.value().const_data_ptr<float>();
checkCublasStatus(cublasLtMatmulDescSetAttribute(
operationDesc,
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
@@ -114,7 +114,7 @@
}

if (Binvs.has_value()) {
- const float* Binvs_pt = Binvs.value().data_ptr<float>();
+ const float* Binvs_pt = Binvs.value().const_data_ptr<float>();
checkCublasStatus(cublasLtMatmulDescSetAttribute(
operationDesc,
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
@@ -91,7 +91,7 @@ bf16fp8bf16_fast_gemv(at::Tensor X, at::Tensor W, at::Tensor w_scale) {
k,
m,
n,
- reinterpret_cast<float const*>(w_scale.data_ptr()),
+ reinterpret_cast<float const*>(w_scale.const_data_ptr()),
num_per_thread);

C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -245,8 +245,8 @@ at::Tensor fp8fp8bf16_fast_gemv(
k,
m,
n,
- reinterpret_cast<float const*>(w_scale.data_ptr()),
- reinterpret_cast<float const*>(x_scale.data_ptr()));
+ reinterpret_cast<float const*>(w_scale.const_data_ptr()),
+ reinterpret_cast<float const*>(x_scale.const_data_ptr()));

if (!dispatched) {
throw std::runtime_error("f8f8bf16_fast_gemv cannot run.");