Grouped Gemm with Fixed K and N with SplitK (#818)

* move all arguments into device

* add b2c_tile_map

* add examples

* add SetDeviceKernelArgs

* dedicated fixed_nk solution

* init client api

* add grouped_gemm_bias example

* add a instance

* add instances

* formatting

* fixed cmake

* Update EnableCompilerWarnings.cmake

* Update cmake-ck-dev.sh

* clean; fixed comments

* fixed comment

* add instances for fp32 output

* add instances for fp32 output

* add fp32 out client example

* fixed CI

* init commit for kbatch

* add splitk gridwise

* format

* fixed

* clean deviceop

* clean code

* finish splitk

* fixed instances

* change m_loops to tile_loops

* add setkbatch

* clean code

* add splitK+bias

* add instances

* opt mk_nk instances

* clean examples

* fixed CI

* remove zero

* finished non-zero

* clean

* clean code

* optimized global_barrier

* fixed ci

* fixed CI

* removed AddBias

* format

* fixed CI

* fixed CI

* move 20_grouped_gemm to 21_grouped_gemm

---------

Co-authored-by: Jing Zhang <jizha@amd.com>

[ROCm/composable_kernel commit: f5ec04f091]
This commit is contained in:
zjing14
2023-08-31 09:22:12 -05:00
committed by GitHub
parent e422d088a3
commit bc88b1a50b
22 changed files with 3593 additions and 4 deletions

View File

@@ -101,6 +101,7 @@ using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;
using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd;
using Gelu = ck::tensor_operation::element_wise::Gelu;
using Swish = ck::tensor_operation::element_wise::Swish;
using Add = ck::tensor_operation::element_wise::Add;
template <typename Activation>
using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;

View File

@@ -0,0 +1,146 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// fp16_output
void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
Row,
Row_Tuple,
Row,
F16,
F16,
F32_Tuple,
F16,
PassThrough,
PassThrough,
Add>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
Col,
Row_Tuple,
Row,
F16,
F16,
F32_Tuple,
F16,
PassThrough,
PassThrough,
Add>>>& instances);
// fp32_output
void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
Row,
Row_Tuple,
Row,
F16,
F16,
F32_Tuple,
F32,
PassThrough,
PassThrough,
Add>>>& instances);
void add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instances(
std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
Col,
Row_Tuple,
Row,
F16,
F16,
F32_Tuple,
F32,
PassThrough,
PassThrough,
Add>>>& instances);
template <typename ALayout,
typename BLayout,
typename ELayout,
typename ADataType,
typename BDataType,
typename EDataType>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
BLayout,
Row_Tuple,
ELayout,
ADataType,
BDataType,
F32_Tuple,
EDataType,
PassThrough,
PassThrough,
Add>>
{
using DeviceOp = DeviceGroupedGemmFixedNK<ALayout,
BLayout,
Row_Tuple,
ELayout,
ADataType,
BDataType,
F32_Tuple,
EDataType,
PassThrough,
PassThrough,
Add>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
// fp16_output
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
is_same_v<EDataType, half_t>)
{
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
}
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
}
}
// fp32_output
if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
is_same_v<EDataType, float>)
{
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_kn_mn_instances(op_ptrs);
}
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
is_same_v<ELayout, Row>)
{
add_device_grouped_gemm_xdl_fixed_nk_bias_f16_f16_f32_mk_nk_mn_instances(op_ptrs);
}
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -26,7 +26,9 @@ struct DeviceMem
void* GetDeviceBuffer() const;
std::size_t GetBufferSize() const;
void ToDevice(const void* p) const;
void ToDevice(const void* p, const std::size_t cpySize) const;
void FromDevice(void* p) const;
void FromDevice(void* p, const std::size_t cpySize) const;
void SetZero() const;
template <typename T>
void SetValue(T x) const;