mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 04:49:54 +00:00
Add BF16 tests for batched_gemm_softmax_gemm_permute (#504)
* fixed bug in softmax reference & add bf16 examples for batched_gemm_scale_softmax_gemm
* added bf16 tests for batched_gemm_softmax_gemm_permute
* changed format of device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
* changed format device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
* aligned annotations
* modified CMakeLists for examples
* add common example code of fp16/bf16 version for batched_gemm_scale_softmax_gemm_xdl
* use macro to control the instances
* added macro control into instances
* clang-format some files
* changed error tolerance for bf16
* changed index for 10_elementwise_normalization
* fixed xdlops code bug in amd_xdlops.hpp
Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
[ROCm/composable_kernel commit: 4c4c7328a6]
This commit is contained in:
@@ -254,7 +254,7 @@ struct intrin_mfma_f32_16x16x8bf16<16, 16>
|
||||
template <class FloatC>
|
||||
__device__ static void Run(const bhalf2_t& reg_a, const bhalf2_t& reg_b, FloatC& reg_c)
|
||||
{
|
||||
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x4bf16(
|
||||
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x8bf16(
|
||||
reg_a, reg_b, reg_c.template AsType<float4_t>()[Number<0>{}], 0, 0, 0);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user