mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 03:19:48 +00:00
Merge commit '254bce934626502ecc043ff1ffb8e9609d8299d6' into develop
This commit is contained in:
@@ -19,6 +19,12 @@ template <> struct typeToStr<fp8_t> { static constexpr const char * name = "fp8"
|
||||
template <> struct typeToStr<bf8_t> { static constexpr const char * name = "bf8"; };
|
||||
template <> struct typeToStr<int8_t> { static constexpr const char * name = "int8"; };
|
||||
template <> struct typeToStr<pk_int4_t> { static constexpr const char * name = "pk_int4"; };
|
||||
|
||||
template <memory_operation_enum MemOp> struct memOpToStr;
|
||||
template <> struct memOpToStr<memory_operation_enum::set> { static constexpr const char * name = "set"; };
|
||||
template <> struct memOpToStr<memory_operation_enum::atomic_add> { static constexpr const char * name = "atomic_add"; };
|
||||
template <> struct memOpToStr<memory_operation_enum::atomic_max> { static constexpr const char * name = "atomic_max"; };
|
||||
template <> struct memOpToStr<memory_operation_enum::add> { static constexpr const char * name = "add"; };
|
||||
// clang-format on
|
||||
|
||||
template <typename ADataType_, typename BDataType_>
|
||||
@@ -32,4 +38,10 @@ std::string gemm_prec_str()
|
||||
return base_str;
|
||||
}
|
||||
|
||||
template <memory_operation_enum MemOp_>
|
||||
std::string mem_op_string()
|
||||
{
|
||||
return std::string(memOpToStr<MemOp_>::name);
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -454,11 +454,8 @@ struct PassThrough
|
||||
}
|
||||
|
||||
template <typename E, typename C, typename... Ds>
|
||||
CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const Ds&... ds) const -> void
|
||||
CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const Ds&...) const -> void
|
||||
{
|
||||
// Suppress unused parameter warning for ds
|
||||
((void)ds, ...);
|
||||
|
||||
// Just assign e with c
|
||||
if constexpr(std::is_same_v<E, C>)
|
||||
{
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/host/concat.hpp"
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/common/utils.hpp"
|
||||
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
|
||||
@@ -123,6 +125,19 @@ struct CShuffleEpilogue
|
||||
|
||||
static_assert(NumDTensor == DsLayout::size(),
|
||||
"The size of DsDataType and DsLayout should be the same");
|
||||
|
||||
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
|
||||
{
|
||||
// clang-format off
|
||||
return concat('_', "CShuffleEpilogue",
|
||||
concat('x', MWave, NWave),
|
||||
concat('x', MPerXdl, NPerXdl, KPerXdl),
|
||||
VectorSizeC,
|
||||
isCTransposed ? "CTransposed" : "CNotTransposed",
|
||||
mem_op_string<MemoryOperation>());
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the vector store size for C tensor.
|
||||
*
|
||||
|
||||
@@ -333,8 +333,8 @@ struct BlockUniversalGemmAsBsCr
|
||||
bool ALoadTranspose = false,
|
||||
bool BLoadTranspose = false>
|
||||
CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
|
||||
[[maybe_unused]] ASmemBlockWindow& a_block_window,
|
||||
[[maybe_unused]] BSmemBlockWindow& b_block_window,
|
||||
const ASmemBlockWindow&,
|
||||
const BSmemBlockWindow&,
|
||||
bool_constant<ALoadTranspose> = {},
|
||||
bool_constant<BLoadTranspose> = {})
|
||||
{
|
||||
|
||||
@@ -3,14 +3,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
|
||||
#include "ck_tile/host/concat.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
@@ -25,8 +21,6 @@ struct BaseGemmPipelineAgBgCrCompV3
|
||||
static constexpr index_t GlobalBufferNum = 1;
|
||||
static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel;
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop)
|
||||
{
|
||||
return num_loop > PrefetchStages;
|
||||
|
||||
@@ -484,7 +484,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
|
||||
elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func);
|
||||
move_tile_window(b_tile_windows, b_dram_tile_window_step);
|
||||
|
||||
if(HasHotLoop)
|
||||
if constexpr(HasHotLoop)
|
||||
{
|
||||
// minus 2 because we have ping-pong double buffer.
|
||||
index_t iCounter = amd_wave_read_first_lane(num_loop - 2);
|
||||
@@ -529,7 +529,6 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
|
||||
// gemm
|
||||
block_gemm(c_block_tile, a_block_tile0, b_block_tile0);
|
||||
HotLoopScheduler();
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
}
|
||||
// pong
|
||||
{
|
||||
@@ -572,7 +571,6 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
|
||||
// gemm
|
||||
block_gemm(c_block_tile, a_block_tile1, b_block_tile1);
|
||||
HotLoopScheduler();
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
}
|
||||
iCounter -= 2;
|
||||
} while(iCounter > 1);
|
||||
@@ -631,8 +629,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
|
||||
Base::LocalPrefetch(a_block_tile1, a_lds_ld_window1, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile1, b_lds_ld_window1, is_b_load_tr_v);
|
||||
block_gemm(c_block_tile, a_block_tile0, b_block_tile0);
|
||||
static_for<0, 8, 1>{}([&](auto i) {
|
||||
ignore = i;
|
||||
static_for<0, 8, 1>{}([&](auto) {
|
||||
__builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
|
||||
__builtin_amdgcn_sched_group_barrier(0x008, 8, 0); // MFMA
|
||||
});
|
||||
|
||||
@@ -263,6 +263,9 @@ using WarpGemmMfma_f32_32x32x16_fp8_fp8 = WarpGemmImpl<
|
||||
using WarpGemmMfma_f32_32x32x16_fp8_bf8 = WarpGemmImpl<
|
||||
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>>>;
|
||||
|
||||
using WarpGemmMfma_f32_16x16x32_fp8_bf8 = WarpGemmImpl<
|
||||
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_bf8<WGAttrCtlEnum::Default_>>>;
|
||||
|
||||
using WarpGemmMfma_f32_32x32x16_bf8_fp8 = WarpGemmImpl<
|
||||
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8<WGAttrCtlEnum::Default_>>>;
|
||||
|
||||
@@ -277,6 +280,10 @@ using WarpGemmMfma_f32_32x32x32_bf8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIter
|
||||
WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8<WGAttrCtlEnum::Default_>,
|
||||
2>>;
|
||||
|
||||
using WarpGemmMfma_f32_32x32x32_fp8_bf8 = WarpGemmImpl<WarpGemmAttributeMfmaIterateK<
|
||||
WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8<WGAttrCtlEnum::Default_>,
|
||||
2>>;
|
||||
|
||||
using WarpGemmMfma_f32_16x16x32_fp8_fp8 = WarpGemmImpl<
|
||||
WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8<WGAttrCtlEnum::Default_>>>;
|
||||
|
||||
|
||||
@@ -1510,6 +1510,9 @@ using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_fp8 =
|
||||
template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
|
||||
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
|
||||
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t, Ctrl_>;
|
||||
template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
|
||||
using WarpGemmAttributeMfmaImpl_f32_16x16x32_fp8_bf8 =
|
||||
WarpGemmAttributeMfmaImpl_f32_16x16x32_f8_base<fp8_t, bf8_t, Ctrl_>;
|
||||
|
||||
template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
|
||||
using WarpGemmAttributeMfmaImpl_f32_16x16x32_bf8_bf8 =
|
||||
|
||||
@@ -105,6 +105,8 @@ template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32,
|
||||
template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16, 32, true> { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8_CTransposed; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 16, 16, 32, false> { using Type = WarpGemmMfma_f32_16x16x32_fp8_bf8; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 32, false> { using Type = WarpGemmMfma_f32_32x32x32_fp8_bf8; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, true> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
|
||||
template<> struct WarpGemmDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
|
||||
|
||||
@@ -556,7 +556,12 @@ struct GroupedConvolutionBackwardDataKernel
|
||||
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
|
||||
{
|
||||
// clang-format off
|
||||
return concat('_', "grouped_convolution_backward_data", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
|
||||
return concat('_', "grouped_convolution_backward_data",
|
||||
gemm_prec_str<InDataType, WeiDataType>(),
|
||||
"gemm",
|
||||
GemmPipeline::GetName(),
|
||||
"epilogue",
|
||||
EpiloguePipeline::GetName());
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
|
||||
@@ -412,10 +412,21 @@ struct GroupedConvolutionBackwardWeightKernel
|
||||
{
|
||||
constexpr auto NumGroupsToMerge = GroupedConvTraitsType_::NumGroupsToMerge;
|
||||
// clang-format off
|
||||
if (NumGroupsToMerge > 1)
|
||||
return concat('_', "grouped_convolution_backward_weight", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName(), "merge", NumGroupsToMerge);
|
||||
else
|
||||
return concat('_', "grouped_convolution_backward_weight", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
|
||||
if (NumGroupsToMerge > 1) {
|
||||
return concat('_', "grouped_convolution_backward_weight",
|
||||
gemm_prec_str<InDataType, WeiDataType>(),
|
||||
"gemm",
|
||||
GemmPipeline::GetName(),
|
||||
"epilogue",
|
||||
EpiloguePipeline::GetName());
|
||||
} else {
|
||||
return concat('_', "grouped_convolution_backward_weight",
|
||||
gemm_prec_str<InDataType, WeiDataType>(),
|
||||
"gemm",
|
||||
GemmPipeline::GetName(),
|
||||
"epilogue",
|
||||
EpiloguePipeline::GetName(), "merge", NumGroupsToMerge);
|
||||
}
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
|
||||
@@ -452,7 +452,12 @@ struct GroupedConvolutionForwardKernel
|
||||
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
|
||||
{
|
||||
// clang-format off
|
||||
return concat('_', "grouped_convolution_forward", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
|
||||
return concat('_', "grouped_convolution_forward",
|
||||
gemm_prec_str<InDataType, WeiDataType>(),
|
||||
"gemm",
|
||||
GemmPipeline::GetName(),
|
||||
"epilogue",
|
||||
EpiloguePipeline::GetName());
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user