Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-04 13:41:24 +00:00)
Added Multi_ABD support into Gemm and GroupedGemmFixedNK (#978)
* added an example grouped_gemm_multi_abd
* fixed CI
* add setElementwiseOp
* changed API
* clean code: add multiA into example
* fixed v7r2 copy
* add transpose
* clean
* fixed vector_load check
* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* add reduce
* testing
* add example_b16_i8
* refactor example
* clean
* add m padding
* disable reduce for kbatch = 1
* separate reduce device op
* add reduce op
* add guard for workspace_size
* add instances
* format
* fixed
* add client example
* add a colmajor
* add instances
* Update cmake-ck-dev.sh
* Update profile_gemm_splitk.cpp
* Update gridwise_gemm_xdlops_v2r4r2.hpp
* format
* Update profile_gemm_splitk.cpp
* fixed
* fixed
* adjust test
* adjust precision loss
* adjust test
* fixed
* add bf16_i8 scale bias
* fixed scale
* fixed scale elementwise_op
* revert contraction deviceop changes
* fixed
* Add AddFastGelu
* Revert "Merge branch 'jizhan/gemm_splitk_reduce' into grouped_gemm_multi_abd_fixed_nk_example"
  This reverts commit 3b5d001efd, reversing changes made to 943199a991.
* add Scales into elementwise
* add gemm_multi_abd client example
* add client examples
* add rcr and crr
* add grouped gemm client example
* add grouped gemm client example
* add instance for rcr crr
* format
* fixed
* fixed cmake
* fixed
* fixed client_example
* format
* fixed contraction isSupport
* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp
  Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
* Update device_reduce_threadwise.hpp
* clean
* Fixes
* Fix example

---------

Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "ck/utility/data_type.hpp"
 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -92,6 +92,15 @@ struct Add
     };
 };
 
+struct Scales
+{
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const
+    {
+        y = ck::type_convert<Y>(ck::type_convert<float>(x0) * ck::type_convert<float>(x1));
+    }
+};
+
 struct Max
 {
     template <typename Y, typename X0, typename X1>
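The Scales functor added above multiplies two inputs in float and converts the product to the output type, which is how per-tensor scales get fused into the multi-ABD epilogue. A minimal, hedged usage sketch (assumptions: the element_wise namespace is inferred from the surrounding hunks, the umbrella include is the one visible in the first hunk, and a hipcc build is assumed so CK's __host__/__device__ qualifiers resolve):

// Hypothetical caller, for illustration only.
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

float scaled(float x, float scale)
{
    float y = 0.f;
    // y = type_convert<float>(type_convert<float>(x) * type_convert<float>(scale))
    ck::tensor_operation::element_wise::Scales{}(y, x, scale);
    return y; // scaled(2.f, 0.5f) == 1.f
}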
@@ -485,6 +494,19 @@ struct AddFastGelu
         e = type_convert<half_t>(x1_f);
     }
 
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t, bhalf_t, bhalf_t>(bhalf_t& e, const bhalf_t& c, const bhalf_t& d) const
+    {
+        const float x0_f = type_convert<float>(c) + type_convert<float>(d);
+
+        float x1_f = 0;
+
+        FastGelu{}.template operator()<float, float>(x1_f, x0_f);
+
+        e = type_convert<bhalf_t>(x1_f);
+    }
+
     template <>
     __host__ __device__ constexpr void
     operator()<bhalf_t, float, bhalf_t>(bhalf_t& e, const float& c, const bhalf_t& d) const
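The new operator()<bhalf_t, bhalf_t, bhalf_t> specialization performs the add and the GELU in float and converts only at the boundaries, mirroring the half_t path above it. A hedged caller sketch, under the same include and namespace assumptions as before:

// Hypothetical caller: fused bias-add + FastGelu on bf16 operands.
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

ck::bhalf_t add_fast_gelu_bf16(ck::bhalf_t c, ck::bhalf_t d)
{
    ck::bhalf_t e;
    ck::tensor_operation::element_wise::AddFastGelu{}(e, c, d);
    return e; // approximately FastGelu(float(c) + float(d)), rounded to bf16
}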
@@ -14,6 +14,8 @@ namespace element_wise {
 template <typename... UnaryOpsSet>
 struct UnaryCombinedOp
 {
+    __host__ __device__ UnaryCombinedOp() : unary_ops_() {}
+
     __host__ __device__ UnaryCombinedOp(UnaryOpsSet... unary_ops) : unary_ops_(unary_ops...) {}
 
     template <typename Y, typename X>
@@ -32,6 +34,8 @@ struct UnaryCombinedOp
 template <typename BinaryOp, typename UnaryOp0, typename UnaryOp1>
 struct BinaryWithUnaryCombinedOp
 {
+    __host__ __device__ BinaryWithUnaryCombinedOp() : binary_op_(), unary_op0_(), unary_op1_() {}
+
     __host__ __device__ BinaryWithUnaryCombinedOp(BinaryOp binary_op,
                                                   UnaryOp0 unary_op0,
                                                   UnaryOp1 unary_op1)
@@ -63,6 +67,11 @@ template <typename BinaryOp0,
           typename UnaryOp2>
 struct TrinaryWithUnaryCombinedOp
 {
+    __host__ __device__ TrinaryWithUnaryCombinedOp()
+        : binary_op0_(), binary_op1_(), unary_op0_(), unary_op1_(), unary_op2_()
+    {
+    }
+
     __host__ __device__ TrinaryWithUnaryCombinedOp(BinaryOp0 binary_op0,
                                                    BinaryOp0 binary_op1,
                                                    UnaryOp0 unary_op0,
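These three hunks only add default constructors, so a combined op can now be value-initialized whenever each sub-op is itself default-constructible. A hedged sketch (the call shape (y, x0, x1) and the exact composition order are assumptions, since the operator bodies are not visible in this diff; Add and PassThrough are existing CK elementwise ops):

// Hypothetical instantiation enabled by the new default constructor.
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ew = ck::tensor_operation::element_wise;

using AddCombined = ew::BinaryWithUnaryCombinedOp<ew::Add, ew::PassThrough, ew::PassThrough>;

float combined_add(float x0, float x1)
{
    float y = 0.f;
    AddCombined{}(y, x0, x1); // value-init compiles now that the default ctor exists
    return y;                 // with PassThrough wrappers this reduces to x0 + x1
}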
@@ -288,10 +288,13 @@ struct ConvertF8RNE
 
 struct Scale
 {
-    __host__ __device__ Scale(float scale) : scale_(scale) {}
+    __host__ __device__ Scale(float scale = 1.f) : scale_(scale) {}
 
     template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const;
+    __host__ __device__ void operator()(Y& y, const X& x) const
+    {
+        y = ck::type_convert<Y>(ck::type_convert<float>(x) * scale_);
+    }
 
     template <>
     __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
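With the defaulted constructor argument and the now-inline generic overload, Scale{} behaves as an identity op (scale_ == 1.f), and any Y/X pair routes through float instead of requiring a per-type specialization. A hedged sketch (the include path for Scale is inferred from the first hunk):

// Hypothetical caller of the generic overload defined above.
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

float apply_scale(float x)
{
    float y = 0.f;
    ck::tensor_operation::element_wise::Scale{}(y, x); // scale_ defaults to 1.f
    return y;                                          // y == x
}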
@@ -500,6 +503,36 @@ struct FastGelu
 
         y = type_convert<half_t>(y_f);
     }
+
+    template <>
+    __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, x);
+
+        y = type_convert<bhalf_t>(y_f);
+    }
+
+    template <>
+    __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<bhalf_t>(y_f);
+    }
+
+    template <>
+    __host__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
+    {
+        float y_f;
+
+        this->operator()<float, float>(y_f, type_convert<float>(x));
+
+        y = type_convert<bhalf_t>(y_f);
+    }
 };
 
 // https://paperswithcode.com/method/gelu
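The three new specializations give FastGelu bf16 input and output on both device and host by round-tripping through the float overload. A hedged host-side sketch under the same include assumption:

// Hypothetical host caller: picks the new __host__ <bhalf_t, bhalf_t> overload.
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

ck::bhalf_t fast_gelu_bf16(ck::bhalf_t x)
{
    ck::bhalf_t y;
    ck::tensor_operation::element_wise::FastGelu{}(y, x);
    return y; // tanh-style fast GELU approximation, computed in float
}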