diff --git a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp index 3e09a04fac..eaa8df4ca4 100644 --- a/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp16_fp8_v3.cpp @@ -27,7 +27,7 @@ using DeviceGemmV2Instance = ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, -#if 0 +#if 1 64, 16, 16, 256, 8, 16, diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index c3f4f58439..1b042fe0ee 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -256,7 +256,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) // get_rtol(), // get_atol()); - LogRangeAsType(std::cout << "c_m_n_device_buf : ", c_m_n_device_result.mData, ",") << std::endl; + //for(int i = 0; i < M; i++) + //{ + // for(int j = 0; j < N; j++) + // { + // std::cout << ck::type_convert(c_m_n_device_result(i, j)) << ","; + // } + // std::cout << std::endl; + //} #endif } diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 618813b781..83d2193eef 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -25,7 +25,7 @@ struct PassThroughPack2 __host__ __device__ constexpr void operator()(ck::half2_t& y, const ck::pk_i4_t& x) const { -#if 1 +#if 0 uint8_t x_u8 = ck::bit_cast(x); uint8_t x_l = (x_u8 & 0x0f) >> 0; uint8_t x_h = (x_u8 & 0xf0) >> 4; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index e89736d7c5..1037ec410f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -151,6 +151,20 @@ struct GridwiseGemm_xdl_cshuffle_v3 using ThisThreadBlock = ThisThreadBlock; + static constexpr index_t APackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + + static constexpr index_t BPackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + __host__ static auto CalculateGridSize(index_t M, index_t N, index_t KBatch) { return std::make_tuple(Block2CTileMap::CalculateGridSize(M, N), 1, KBatch); @@ -625,9 +639,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 // in some cases. else if constexpr(is_same::value) { - constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(ADataType) < 1 - ? 1 - : 32 * 4 / KPerBlock / sizeof(ADataType); + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(ADataType); + constexpr auto MLdsLayer = LdsSize < 1 ? 1 : LdsSize; constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( make_tuple( AK0Number * Number{}, Number{}, AK1Number), @@ -761,10 +774,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 else if constexpr(is_same::value) { // NLdsLayer * K0 as logical Bank - constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(BDataType) < 1 - ? 1 - : 32 * 4 / KPerBlock / sizeof(BDataType); - ; + constexpr index_t LdsSize = 32 * 4 / KPerBlock / sizeof(BDataType); + constexpr auto NLdsLayer = LdsSize < 1 ? 1 : LdsSize; constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( make_tuple( BK0Number * Number{}, Number{}, BK1Number), @@ -924,20 +935,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 NXdlPerWave, KPack>())>; - static constexpr index_t APackedSize = []() { - if constexpr(is_same_v, pk_i4_t>) - return 2; - else - return 1; - }(); - - static constexpr index_t BPackedSize = []() { - if constexpr(is_same_v, pk_i4_t>) - return 2; - else - return 1; - }(); - __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment @@ -1326,8 +1323,8 @@ struct GridwiseGemm_xdl_cshuffle_v3 static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize); auto b_block_buf = make_dynamic_buffer( - bit_cast(bit_cast(p_shared) + - a_block_space_size_aligned * sizeof(ADataType)), + reinterpret_cast(static_cast(p_shared) + + a_block_space_size_aligned), b_block_desc_bk0_n_bk1.GetElementSpaceSize() / BPackedSize); constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index c4b96def41..a0a0e7f230 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1211,10 +1211,6 @@ struct ThreadwiseTensorSliceTransfer_v4 dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; - - - if constexpr(is_same_v, half_t>) - printf("v4: %f %d\n", type_convert(dst_buf[Number{}]), threadIdx.x); }); } }); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 6beaf9155a..6b4b2a8a04 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -554,9 +554,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 dst_element_op_(dst_v, dst_vector_container.template AsType()[i]); dst_vector_container.template AsType()(i) = dst_v; - - //if constexpr(is_same_v, half_t>) - //printf("v3r1: %f %d\n", type_convert(dst_v), threadIdx.x); }); // copy data from dst_vector_container to dst_buf diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index f2db1c95a9..d8ccb2ea76 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -157,16 +157,6 @@ struct intrin_mfma_f32_16x16x16f16<16, 16> template __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) { - auto tmp_a = vector_type{reg_a}; - auto tmp_b = vector_type{reg_b}; - printf("{%f %f}, {%f %f}, {%f %f}, {%f %f} %d %d\n", - static_cast(tmp_a.template AsType()(Number<0>{})), static_cast(tmp_b.template AsType()(Number<0>{})), - static_cast(tmp_a.template AsType()(Number<1>{})), static_cast(tmp_b.template AsType()(Number<1>{})), - static_cast(tmp_a.template AsType()(Number<2>{})), static_cast(tmp_b.template AsType()(Number<2>{})), - static_cast(tmp_a.template AsType()(Number<3>{})), static_cast(tmp_b.template AsType()(Number<3>{})), - threadIdx.x, blockIdx.x - ); - reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); }