mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-03 05:37:34 +00:00
clean
This commit is contained in:
@@ -27,7 +27,7 @@ using DeviceGemmV2Instance =
|
||||
ALayout, BLayout, CLayout,
|
||||
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
|
||||
AElementOp, BElementOp, CElementOp, GemmDefault,
|
||||
#if 0
|
||||
#if 1
|
||||
64,
|
||||
16, 16,
|
||||
256, 8, 16,
|
||||
|
||||
@@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
|
||||
// get_rtol<CDataType>(),
|
||||
// get_atol<CDataType>());
|
||||
|
||||
LogRangeAsType<float>(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl;
|
||||
//for(int i = 0; i < M; i++)
|
||||
//{
|
||||
// for(int j = 0; j < N; j++)
|
||||
// {
|
||||
// std::cout << ck::type_convert<float>(c_m_n_device_result(i, j)) << ",";
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
//}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ struct PassThroughPack2
|
||||
|
||||
__host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
|
||||
uint8_t x_l = (x_u8 & 0x0f) >> 0;
|
||||
uint8_t x_h = (x_u8 & 0xf0) >> 4;
|
||||
|
||||
@@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3
|
||||
|
||||
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
|
||||
|
||||
static constexpr index_t APackedSize = []() {
|
||||
if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
|
||||
return 2;
|
||||
else
|
||||
return 1;
|
||||
}();
|
||||
|
||||
static constexpr index_t BPackedSize = []() {
|
||||
if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
|
||||
return 2;
|
||||
else
|
||||
return 1;
|
||||
}();
|
||||
|
||||
__host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
|
||||
{
|
||||
return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch);
|
||||
@@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
|
||||
// in some cases.
|
||||
else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
|
||||
{
|
||||
constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1
|
||||
? 1
|
||||
: 32 * 4 / KPerBlock / sizeof(ADataType);
|
||||
constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType);
|
||||
constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize;
|
||||
constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
|
||||
make_tuple(
|
||||
AK0Number * Number<MLdsLayer>{}, Number<MPerBlock / MLdsLayer>{}, AK1Number),
|
||||
@@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
|
||||
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
|
||||
{
|
||||
// NLdsLayer * K0 as logical Bank
|
||||
constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1
|
||||
? 1
|
||||
: 32 * 4 / KPerBlock / sizeof(BDataType);
|
||||
;
|
||||
constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType);
|
||||
constexpr auto NLdsLayer = LdsSize < 1 ? 1 : LdsSize;
|
||||
constexpr auto b_lds_block_desc = make_naive_tensor_descriptor(
|
||||
make_tuple(
|
||||
BK0Number * Number<NLdsLayer>{}, Number<NPerBlock / NLdsLayer>{}, BK1Number),
|
||||
@@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
|
||||
NXdlPerWave,
|
||||
KPack>())>;
|
||||
|
||||
static constexpr index_t APackedSize = []() {
|
||||
if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
|
||||
return 2;
|
||||
else
|
||||
return 1;
|
||||
}();
|
||||
|
||||
static constexpr index_t BPackedSize = []() {
|
||||
if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
|
||||
return 2;
|
||||
else
|
||||
return 1;
|
||||
}();
|
||||
|
||||
__device__ static constexpr index_t GetSharedMemoryNumberOfByte()
|
||||
{
|
||||
// LDS allocation for A and B: be careful of alignment
|
||||
@@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
|
||||
static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);
|
||||
|
||||
auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
|
||||
bit_cast<BDataType*>(bit_cast<unsigned char *>(p_shared) +
|
||||
a_block_space_size_aligned * sizeof(ADataType)),
|
||||
reinterpret_cast<BDataType*>(static_cast<ADataType*>(p_shared) +
|
||||
a_block_space_size_aligned),
|
||||
b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize);
|
||||
|
||||
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
|
||||
|
||||
@@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4
|
||||
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
|
||||
|
||||
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
|
||||
|
||||
|
||||
if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
|
||||
printf("v4: %f %d\n", type_convert<float>(dst_buf[Number<dst_offset>{}]), threadIdx.x);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
@@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
|
||||
dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
|
||||
|
||||
dst_vector_container.template AsType<DstData>()(i) = dst_v;
|
||||
|
||||
//if constexpr(is_same_v<remove_cvref_t<SrcData>, half_t>)
|
||||
//printf("v3r1: %f %d\n", type_convert<float>(dst_v), threadIdx.x);
|
||||
});
|
||||
|
||||
// copy data from dst_vector_container to dst_buf
|
||||
|
||||
@@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16>
|
||||
template <class FloatC>
|
||||
__device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
|
||||
{
|
||||
auto tmp_a = vector_type<half_t, 4>{reg_a};
|
||||
auto tmp_b = vector_type<half_t, 4>{reg_b};
|
||||
printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n",
|
||||
static_cast<float>(tmp_a.template AsType<half_t>()(Number<0>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<0>{})),
|
||||
static_cast<float>(tmp_a.template AsType<half_t>()(Number<1>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<1>{})),
|
||||
static_cast<float>(tmp_a.template AsType<half_t>()(Number<2>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<2>{})),
|
||||
static_cast<float>(tmp_a.template AsType<half_t>()(Number<3>{})), static_cast<float>(tmp_b.template AsType<half_t>()(Number<3>{})),
|
||||
threadIdx.x, blockIdx.x
|
||||
);
|
||||
|
||||
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user