From e92c0bf68e45f8a03ac272f8fbb79a3035e12cca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 4 Aug 2025 15:34:35 +0000 Subject: [PATCH] Initial integaration of packed cast. --- .../gridwise_gemm_xdl_cshuffle_conv_v3.hpp | 30 +++++++++++++++++-- .../tensor_operation/gpu/grid/packed_cast.hpp | 17 +++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/grid/packed_cast.hpp diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp index 68112489ca..4c76035112 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp @@ -8,6 +8,7 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/packed_cast.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_selector.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" @@ -100,6 +101,21 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3 using ThisThreadBlock = ThisThreadBlock; + // gfx950 specific optimizations for BF16 inputs +#if defined(__gfx950__) + static constexpr bool is_gfx950_and_bf16_input_ = + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v; +#else + static constexpr bool is_gfx950_and_bf16_input_ = false; +#endif + + using CShuffleInputDataType = std::conditional_t; + __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch, index_t Batch) { return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), KBatch, Batch); @@ -884,7 +900,7 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3 // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3{}([&](auto access_id) { // make sure it's safe to write to LDS block_sync_lds(); + + if constexpr (is_gfx950_and_bf16_input_) + { + packed_cast(sfc_c_vgpr); + } // each thread write its data from VGPR to LDS c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/packed_cast.hpp b/include/ck/tensor_operation/gpu/grid/packed_cast.hpp new file mode 100644 index 0000000000..f0b226aaae --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/packed_cast.hpp @@ -0,0 +1,17 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + + __host__ __device__ inline void packed_cast(const auto& sfc_c_vgpr) + { + // This function is a placeholder for packed cast operations. + // For now, it does nothing. + }; +}