Added Multi_ABD support into Gemm and GroupedGemmFixedNK (#978)

* added an example grouped_gemm_multi_abd

* fixed ci

* add setElementwiseOp

* changed API

* clean code: add multiA into example

* fixed v7r2 copy

* add transpose

* clean

* fixed vector_load check

* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update example/15_grouped_gemm/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* add reduce

* testing

* add example_b16_i8

* refactor example

* clean

* add m padding

* disable reduce for kbatch = 1

* separate reduce device op

* add reduce op

* add guard for workspace_size

* add instances

* format

* fixed

* add client example

* add a colmajor

* add instances

* Update cmake-ck-dev.sh

* Update profile_gemm_splitk.cpp

* Update gridwise_gemm_xdlops_v2r4r2.hpp

* format

* Update profile_gemm_splitk.cpp

* fixed

* fixed

* adjust test

* adjust precision loss

* adjust test

* fixed

* add bf16_i8 scale bias

* fixed scale

* fixed scale elementwise_op

* revert contraction deviceop changes

* fixed

* Add AddFastGelu

* Revert "Merge branch 'jizhan/gemm_splitk_reduce' into grouped_gemm_multi_abd_fixed_nk_example"

This reverts commit 3b5d001efd, reversing
changes made to 943199a991.

* add Scales into elementwise

* add gemm_multi_abd client example

* add client examples

* add rcr and crr

* add grouped gemm client example

* add grouped gemm client example

* add instance for rcr crr

* format

* fixed

* fixed cmake

* fixed

* fixed client_example

* format

* fixed contraction isSupport

* Update include/ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>

* Update device_reduce_threadwise.hpp

* clean

* Fixes

* Fix example

---------

Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
This commit is contained in:
zjing14
2024-04-15 21:09:45 -05:00
committed by GitHub
parent db376dd8a4
commit 12865fbf28
45 changed files with 6345 additions and 199 deletions

View File

@@ -4,7 +4,7 @@
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
namespace tensor_operation {
@@ -92,6 +92,15 @@ struct Add
};
};
// Binary elementwise functor: y = x0 * x1.
// Both inputs are converted to float before multiplying, and the float
// product is converted to the output type Y, so mixed input types
// (e.g. bf16 data scaled by a float factor) are supported.
struct Scales
{
template <typename Y, typename X0, typename X1>
__host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const
{
// Compute in float to avoid precision loss / missing operator overloads
// on low-precision types, then convert once to Y.
y = ck::type_convert<Y>(ck::type_convert<float>(x0) * ck::type_convert<float>(x1));
}
};
struct Max
{
template <typename Y, typename X0, typename X1>
@@ -485,6 +494,19 @@ struct AddFastGelu
e = type_convert<half_t>(x1_f);
}
// bf16 specialization of AddFastGelu: e = FastGelu(c + d).
// The sum and the activation are evaluated in float; only the final
// result is narrowed back to bhalf_t.
template <>
__host__ __device__ constexpr void
operator()<bhalf_t, bhalf_t, bhalf_t>(bhalf_t& e, const bhalf_t& c, const bhalf_t& d) const
{
const float x0_f = type_convert<float>(c) + type_convert<float>(d);
float x1_f = 0;
// Delegate to the float/float FastGelu implementation.
FastGelu{}.template operator()<float, float>(x1_f, x0_f);
e = type_convert<bhalf_t>(x1_f);
}
template <>
__host__ __device__ constexpr void
operator()<bhalf_t, float, bhalf_t>(bhalf_t& e, const float& c, const bhalf_t& d) const

View File

@@ -14,6 +14,8 @@ namespace element_wise {
template <typename... UnaryOpsSet>
struct UnaryCombinedOp
{
__host__ __device__ UnaryCombinedOp() : unary_ops_() {}
__host__ __device__ UnaryCombinedOp(UnaryOpsSet... unary_ops) : unary_ops_(unary_ops...) {}
template <typename Y, typename X>
@@ -32,6 +34,8 @@ struct UnaryCombinedOp
template <typename BinaryOp, typename UnaryOp0, typename UnaryOp1>
struct BinaryWithUnaryCombinedOp
{
__host__ __device__ BinaryWithUnaryCombinedOp() : binary_op_(), unary_op0_(), unary_op1_() {}
__host__ __device__ BinaryWithUnaryCombinedOp(BinaryOp binary_op,
UnaryOp0 unary_op0,
UnaryOp1 unary_op1)
@@ -63,6 +67,11 @@ template <typename BinaryOp0,
typename UnaryOp2>
struct TrinaryWithUnaryCombinedOp
{
__host__ __device__ TrinaryWithUnaryCombinedOp()
: binary_op0_(), binary_op1_(), unary_op0_(), unary_op1_(), unary_op2_()
{
}
__host__ __device__ TrinaryWithUnaryCombinedOp(BinaryOp0 binary_op0,
BinaryOp0 binary_op1,
UnaryOp0 unary_op0,

View File

@@ -288,10 +288,13 @@ struct ConvertF8RNE
struct Scale
{
__host__ __device__ Scale(float scale) : scale_(scale) {}
__host__ __device__ Scale(float scale = 1.f) : scale_(scale) {}
template <typename Y, typename X>
__host__ __device__ void operator()(Y& y, const X& x) const;
__host__ __device__ void operator()(Y& y, const X& x) const
{
y = ck::type_convert<Y>(ck::type_convert<float>(x) * scale_);
}
template <>
__host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
@@ -500,6 +503,36 @@ struct FastGelu
y = type_convert<half_t>(y_f);
}
// Device-only bf16-out / float-in specialization: compute FastGelu in
// float, then narrow the result to bhalf_t.
template <>
__device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
{
float y_f;
this->operator()<float, float>(y_f, x);
y = type_convert<bhalf_t>(y_f);
}
// Device-only bf16-in / bf16-out specialization: widen the input to
// float, run the float FastGelu, narrow the result back to bhalf_t.
template <>
__device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
{
float y_f;
this->operator()<float, float>(y_f, type_convert<float>(x));
y = type_convert<bhalf_t>(y_f);
}
// Host-side counterpart of the bf16 specialization above; same
// float-widen / compute / narrow sequence, compiled for the host.
template <>
__host__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
{
float y_f;
this->operator()<float, float>(y_f, type_convert<float>(x));
y = type_convert<bhalf_t>(y_f);
}
};
// https://paperswithcode.com/method/gelu