mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
Batched gemm bf16 (#142)
* add bf16 for batched gemm * batched_gemm_bf16 works * recover accidently changed files
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include "reference_batched_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -11,6 +12,14 @@ using DeviceGemmNoOpPtr =
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>;
|
||||
|
||||
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(
|
||||
std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(
|
||||
std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
|
||||
std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(
|
||||
std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
|
||||
void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
|
||||
@@ -77,6 +86,8 @@ void profile_batched_gemm_impl(int do_verification,
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
Tensor<CDataType> c_g_m_n_device_result(
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
std::unique_ptr<Tensor<float>> c_f32_g_m_n_host_result = nullptr;
|
||||
std::unique_ptr<Tensor<float>> c_f32_g_m_n_device_result = nullptr;
|
||||
|
||||
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
|
||||
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
|
||||
@@ -107,21 +118,56 @@ void profile_batched_gemm_impl(int do_verification,
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using ReferenceBatchedGemmInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CElementOp>;
|
||||
if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
|
||||
is_same<BDataType, ck::bhalf_t>::value &&
|
||||
is_same<CDataType, ck::bhalf_t>::value)
|
||||
{
|
||||
Tensor<float> a_f32_g_m_k(
|
||||
f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{}));
|
||||
Tensor<float> b_f32_g_k_n(
|
||||
f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{}));
|
||||
c_f32_g_m_n_host_result = std::make_unique<Tensor<float>>(
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
c_f32_g_m_n_device_result = std::make_unique<Tensor<float>>(
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
|
||||
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
|
||||
auto ref_invoker = ref_batched_gemm.MakeInvoker();
|
||||
bf16_to_f32_(a_g_m_k, a_f32_g_m_k);
|
||||
bf16_to_f32_(b_g_k_n, b_f32_g_k_n);
|
||||
|
||||
auto ref_argument = ref_batched_gemm.MakeArgument(
|
||||
a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
|
||||
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
|
||||
ReferenceBatchedGemm<float, float, float, AElementOp, BElementOp, CElementOp>;
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
|
||||
auto ref_invoker = ref_batched_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_batched_gemm.MakeArgument(a_f32_g_m_k,
|
||||
b_f32_g_k_n,
|
||||
*c_f32_g_m_n_host_result,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
using ReferenceBatchedGemmInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CElementOp>;
|
||||
|
||||
auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
|
||||
auto ref_invoker = ref_batched_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_batched_gemm.MakeArgument(
|
||||
a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
}
|
||||
}
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
|
||||
@@ -168,6 +214,38 @@ void profile_batched_gemm_impl(int do_verification,
|
||||
add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(gemm_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same<ADataType, bhalf_t>::value && is_same<BDataType, bhalf_t>::value &&
|
||||
is_same<CDataType, bhalf_t>::value)
|
||||
{
|
||||
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
|
||||
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
|
||||
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
ck::tensor_operation::device::device_batched_gemm_instance::
|
||||
add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(gemm_ptrs);
|
||||
}
|
||||
else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
|
||||
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
|
||||
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
ck::tensor_operation::device::device_batched_gemm_instance::
|
||||
add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(gemm_ptrs);
|
||||
}
|
||||
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
|
||||
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
|
||||
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
ck::tensor_operation::device::device_batched_gemm_instance::
|
||||
add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(gemm_ptrs);
|
||||
}
|
||||
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
|
||||
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
|
||||
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
ck::tensor_operation::device::device_batched_gemm_instance::
|
||||
add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(gemm_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same<ADataType, float>::value && is_same<BDataType, float>::value &&
|
||||
is_same<CDataType, float>::value)
|
||||
{
|
||||
@@ -294,7 +372,19 @@ void profile_batched_gemm_impl(int do_verification,
|
||||
{
|
||||
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
|
||||
|
||||
check_error(c_g_m_n_host_result, c_g_m_n_device_result);
|
||||
if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
|
||||
is_same<BDataType, ck::bhalf_t>::value &&
|
||||
is_same<CDataType, ck::bhalf_t>::value)
|
||||
{
|
||||
|
||||
bf16_to_f32_(c_g_m_n_device_result, *c_f32_g_m_n_device_result);
|
||||
check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
check_error(c_g_m_n_host_result, c_g_m_n_device_result);
|
||||
}
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user