diff --git a/CMakeLists.txt b/CMakeLists.txt index cd03961477..3558666e5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,13 +246,6 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000) add_compile_options("SHELL: -mllvm --lsr-drop-solution=1") endif() endif() -# if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) -# check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED) -# if(HAS_ENABLE_POST_MISCHED) -# message("Adding the enable-post-misched=0 compiler flag") -# add_compile_options("SHELL: -mllvm -enable-post-misched=0") -# endif() -# endif() set(check-coerce) check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce) if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132) diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp index 34b9e0f6b1..c33fe357d8 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp @@ -24,10 +24,9 @@ template using S = ck::Sequence; -using F16 = ck::half_t; -using FP8 = ck::f8_t; -using F32 = float; -using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using FP8 = ck::f8_t; +using F32 = float; using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -39,7 +38,7 @@ using CShuffleDataType = F32; using D0DataType = F32; using D1DataType = F32; using DsDataType = ck::Tuple; -using EDataType = BF16; +using EDataType = F16; using A0Layout = Row; using B0Layout = Col; @@ -48,23 +47,21 @@ using D1Layout = Col; using DsLayout = ck::Tuple; using ELayout = Row; -// struct MultiplyMultiply -// { -// template -// __host__ __device__ constexpr void -// operator()(E& e, const C& c, const D0& d0, const D1& d1) const; +struct MultiplyMultiply +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1) const; -// template <> -// __host__ __device__ constexpr void operator()( -// ck::half_t& e, const float& c, const float& d0, const float& d1) const -// { -// const float x0_f = c * d0 * d1; + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const float& c, const float& d0, const float& d1) const + { + const float x0_f = c * d0 * d1; -// e = ck::type_convert(x0_f); -// } -// }; - -using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; + e = ck::type_convert(x0_f); + } +}; using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index 1c5080ea1b..a9e73bf461 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -688,19 +688,40 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 // in some cases. else if constexpr(is_same::value) { - constexpr auto a_lds_block_desc = - make_naive_tensor_descriptor(make_tuple(AK0Number, Number{}, AK1Number), - make_tuple(AK1Number, Number{}, I1)); + constexpr auto MLdsLayer = 32 * 4 / KPerBlock / sizeof(LDSTypeA) < 1 + ? 1 + : 32 * 4 / KPerBlock / sizeof(LDSTypeA); + constexpr auto a_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + AK0Number * Number{}, Number{}, AK1Number), + make_tuple(AK1Number, Number{}, I1)); constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( a_lds_block_desc, - make_tuple(make_xor_with_modulo_transform( - make_tuple(Number{}, Number{})), + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), make_pass_through_transform(AK1Number)), make_tuple(Sequence<1, 0>{}, Sequence<2>{}), make_tuple(Sequence<1, 0>{}, Sequence<2>{})); - return a_lds_block_desc_permuted; + constexpr auto a_lds_block_desc_ak0_mldslayer_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(AK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto a_lds_block_desc_ak0_m_ak1 = transform_tensor_descriptor( + a_lds_block_desc_ak0_mldslayer_m_ak1, + make_tuple(make_pass_through_transform(AK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(AK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_lds_block_desc_ak0_m_ak1; } else // ColumnMajor A { @@ -804,19 +825,42 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 } else if constexpr(is_same::value) { - constexpr auto b_lds_block_desc = - make_naive_tensor_descriptor(make_tuple(BK0Number, Number{}, BK1Number), - make_tuple(BK1Number, Number{}, I1)); + // NLdsLayer * K0 as logical Bank + constexpr auto NLdsLayer = 32 * 4 / KPerBlock / sizeof(LDSTypeB) < 1 + ? 1 + : 32 * 4 / KPerBlock / sizeof(LDSTypeB); + ; + constexpr auto b_lds_block_desc = make_naive_tensor_descriptor( + make_tuple( + BK0Number * Number{}, Number{}, BK1Number), + make_tuple(BK1Number, Number{}, I1)); constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( b_lds_block_desc, - make_tuple(make_xor_with_modulo_transform( - make_tuple(Number{}, Number{})), + make_tuple(make_xor_with_modulo_transform(make_tuple( + Number{}, Number{})), make_pass_through_transform(BK1Number)), make_tuple(Sequence<1, 0>{}, Sequence<2>{}), make_tuple(Sequence<1, 0>{}, Sequence<2>{})); - return b_lds_block_desc_permuted; + constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(BK0Number, Number{})), + make_pass_through_transform(Number{}), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}, Sequence<3>{})); + + constexpr auto b_lds_block_desc_bk0_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_bk0_nldslayer_n_bk1, + make_tuple(make_pass_through_transform(BK0Number), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(BK1Number)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_lds_block_desc_bk0_n_bk1; } else // RowMajor B { diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 58a6dc6f5f..2b059592e8 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -59,6 +59,8 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) + list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp) + list(APPEND PROFILER_SOURCES profile_batched_gemm_b_scale.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp) list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp) @@ -143,6 +145,8 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance) + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_b_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)