Grouped Gemm device with multiD grid (#319)

* replace gridwise_v2r3 with multiD

* adjust parameters

* add instances

* fixed test_grouped_gemm

* fix standalone softmax race condition around blockwise reduction

* fixed ci

* fixed comment: remove redundant workspace

* use instanceFactory

* add test layout

* add empty Ds

* add bias example

* use array

* sperate examples

Co-authored-by: Anthony Chang <ac.chang@outlook.com>

[ROCm/composable_kernel commit: 7959dad566]
This commit is contained in:
zjing14
2022-07-21 10:07:01 -05:00
committed by GitHub
parent fab458978a
commit cab6416fa5
14 changed files with 1161 additions and 665 deletions

View File

@@ -7,9 +7,11 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
@@ -17,41 +19,17 @@
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using DeviceGroupedGemmNoOpPtr = ck::tensor_operation::device::DeviceGroupedGemmPtr<
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename EDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout>
void profile_grouped_gemm_impl(int do_verification,
bool profile_grouped_gemm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
@@ -62,6 +40,9 @@ void profile_grouped_gemm_impl(int do_verification,
const std::vector<int>& StrideBs,
const std::vector<int>& StrideCs)
{
bool pass = true;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
@@ -86,7 +67,7 @@ void profile_grouped_gemm_impl(int do_verification,
std::vector<Tensor<ADataType>> a_m_k;
std::vector<Tensor<BDataType>> b_k_n;
std::vector<Tensor<CDataType>> c_m_n_device_results;
std::vector<Tensor<EDataType>> c_m_n_device_results;
for(std::size_t i = 0; i < group_count; i++)
{
@@ -96,7 +77,7 @@ void profile_grouped_gemm_impl(int do_verification,
Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
c_m_n_device_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
Tensor<EDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
@@ -115,7 +96,7 @@ void profile_grouped_gemm_impl(int do_verification,
b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
}
c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0<EDataType>{}, num_thread);
}
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -145,9 +126,9 @@ void profile_grouped_gemm_impl(int do_verification,
p_b.reserve(group_count);
p_c.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_shapes.reserve(group_count);
gemm_descs.reserve(group_count);
for(std::size_t i = 0; i < group_count; i++)
{
@@ -157,56 +138,34 @@ void profile_grouped_gemm_impl(int do_verification,
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpace()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpace()));
sizeof(EDataType) * c_m_n_device_results[i].mDesc.GetElementSpace()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
gemm_shapes.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i]});
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
}
// add device GEMM instances
std::vector<ck::tensor_operation::device::instance::DeviceGroupedGemmNoOpPtr> gemm_ptrs;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
CElementOp>;
if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
is_same<CDataType, half_t>::value)
{
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
}
}
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
if(gemm_ptrs.size() <= 0)
if(op_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device GEMM instance found");
}
@@ -216,14 +175,17 @@ void profile_grouped_gemm_impl(int do_verification,
float best_tflops = 0;
float best_gb_per_sec = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
// profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs)
for(auto& gemm_ptr : op_ptrs)
{
auto argument_ptr =
gemm_ptr->MakeArgumentPointer(p_a,
p_b,
p_ds,
p_c,
gemm_shapes,
gemm_descs,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{});
@@ -242,12 +204,12 @@ void profile_grouped_gemm_impl(int do_verification,
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
sizeof(CDataType) * Ms[i] * Ns[i];
sizeof(EDataType) * Ms[i] * Ns[i];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -266,18 +228,18 @@ void profile_grouped_gemm_impl(int do_verification,
if(do_verification)
{
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
Tensor<CDataType> c_m_n_host_result(
Tensor<EDataType> c_m_n_host_result(
f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
using ReferenceGemmInstance =
ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
EDataType,
AccDataType,
AElementOp,
BElementOp,
@@ -294,7 +256,8 @@ void profile_grouped_gemm_impl(int do_verification,
c_element_op);
ref_invoker.Run(ref_argument);
ck::utils::check_err(c_m_n_device_results[i].mData, c_m_n_host_result.mData);
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
c_m_n_host_result.mData);
if(do_log)
{
@@ -319,6 +282,8 @@ void profile_grouped_gemm_impl(int do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
return pass;
} // namespace profiler
} // namespace profiler