From f0b6fdcadb9567cab74498070aa1d9b3791f5951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <188998872+vpietila-amd@users.noreply.github.com> Date: Tue, 28 Oct 2025 07:19:29 +0200 Subject: [PATCH] Add name member to CK elementwise operations. (#3102) [ROCm/composable_kernel commit: 1c17bae816edc44c32ee9d1a19d79d768fd1be13] --- .../element/binary_element_wise_operation.hpp | 20 +++++ .../combined_element_wise_operation.hpp | 6 ++ .../gpu/element/element_wise_operation.hpp | 25 ++++++ .../gpu/element/quantization_operation.hpp | 14 +++ .../element/unary_element_wise_operation.hpp | 88 +++++++++++++++++++ 5 files changed, 153 insertions(+) diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index 4954144aca..10a9a4dbae 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -96,6 +96,8 @@ struct Add struct Max { + static constexpr const char* name = "Max"; + template __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1) const { @@ -107,6 +109,8 @@ struct Max struct Min { + static constexpr const char* name = "Min"; + template __host__ __device__ void operator()(Y& y, const X0& x0, const X1& x1) const { @@ -118,6 +122,8 @@ struct Min struct Multiply { + static constexpr const char* name = "Multiply"; + template __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const; @@ -210,6 +216,8 @@ struct Multiply struct ScaleAdd { + static constexpr const char* name = "ScaleAdd"; + __host__ __device__ ScaleAdd(float scale = 1.f) : scale_(scale) {} template @@ -237,6 +245,8 @@ struct ScaleAdd struct Subtract { + static constexpr const char* name = "Subtract"; + template __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; @@ -531,6 +541,8 @@ struct AddRelu struct AddHardswish { + static constexpr const char* name = "AddHardswish"; + template __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; @@ -568,6 +580,8 @@ struct AddHardswish // E = FastGelu(C + D) struct AddFastGelu { + static constexpr const char* name = "AddFastGelu"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; @@ -633,6 +647,8 @@ struct AddFastGelu // E = MultiplyFastGelu(C + D) struct MultiplyFastGelu { + static constexpr const char* name = "MultiplyFastGelu"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; @@ -698,6 +714,8 @@ struct MultiplyFastGelu // E = Silu(C + D) struct AddSilu { + static constexpr const char* name = "AddSilu"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; @@ -748,6 +766,8 @@ struct AddSilu struct ConvScaleAdd { + static constexpr const char* name = "ConvScaleAdd"; + __host__ __device__ ConvScaleAdd(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) diff --git a/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp index 3cc1c3c42c..083327b8f0 100644 --- a/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp @@ -13,6 +13,8 @@ namespace element_wise { template struct UnaryCombinedOp { + static constexpr const char* name = "UnaryCombinedOp"; + __host__ __device__ UnaryCombinedOp() : unary_ops_() {} __host__ __device__ UnaryCombinedOp(UnaryOpsSet... unary_ops) : unary_ops_(unary_ops...) {} @@ -33,6 +35,8 @@ struct UnaryCombinedOp template struct BinaryWithUnaryCombinedOp { + static constexpr const char* name = "BinaryWithUnaryCombinedOp"; + __host__ __device__ BinaryWithUnaryCombinedOp() : binary_op_(), unary_op0_(), unary_op1_() {} __host__ __device__ BinaryWithUnaryCombinedOp(BinaryOp binary_op, @@ -66,6 +70,8 @@ template struct TrinaryWithUnaryCombinedOp { + static constexpr const char* name = "TrinaryWithUnaryCombinedOp"; + __host__ __device__ TrinaryWithUnaryCombinedOp() : binary_op0_(), binary_op1_(), unary_op0_(), unary_op1_(), unary_op2_() { diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 5edcdd257b..709f1bf6aa 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -33,6 +33,8 @@ namespace element_wise { struct AddReluAdd { + static constexpr const char* name = "AddReluAdd"; + template __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const; @@ -102,6 +104,8 @@ struct AddReluAdd struct AddHardswishAdd { + static constexpr const char* name = "AddHardswishAdd"; + template __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const; @@ -134,6 +138,8 @@ struct AddHardswishAdd // E = C + D0 + D1 struct AddAdd { + static constexpr const char* name = "AddAdd"; + template __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const { @@ -163,6 +169,8 @@ struct AddAdd // E = (C + D0) x D1 struct AddMultiply { + static constexpr const char* name = "AddMultiply"; + template __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; @@ -199,6 +207,8 @@ struct AddMultiply // E = C x D0 + D1 struct MultiplyAdd { + static constexpr const char* name = "MultiplyAdd"; + template __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; @@ -251,6 +261,8 @@ struct MultiplyAdd struct MultiplyMultiply { + static constexpr const char* name = "MultiplyMultiply"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; @@ -306,6 +318,8 @@ struct MultiplyMultiply struct MultiplyAddFastGelu { + static constexpr const char* name = "MultiplyAddFastGelu"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; @@ -327,6 +341,8 @@ struct MultiplyAddFastGelu // E = FastGelu(C + D0 + D1) struct AddAddFastGelu { + static constexpr const char* name = "AddAddFastGelu"; + template __host__ __device__ constexpr void operator()(E& e, const C& c, const D0& d0, const D1& d1) const; @@ -398,6 +414,7 @@ struct AddAddFastGelu // E = Relu(alpha1 * C + alpha2 * D0 + D1) struct ScaleAddScaleAddRelu { + static constexpr const char* name = "ScaleAddScaleAddRelu"; ScaleAddScaleAddRelu(const float alpha1 = 1.f, const float alpha2 = 1.f) : alpha1_(alpha1), alpha2_(alpha2) @@ -462,6 +479,8 @@ struct ScaleAddScaleAddRelu struct Normalize { + static constexpr const char* name = "Normalize"; + // FIXME: is double absolutely necessary? Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {} @@ -533,6 +552,8 @@ struct Normalize // The data type of mean and variance is used as AccDataType struct NormalizeInInfer { + static constexpr const char* name = "NormalizeInInfer"; + NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {} template @@ -622,6 +643,8 @@ struct UnaryTypeConvert; template <> struct UnaryTypeConvert { + static constexpr const char* name = "UnaryTypeConvert"; + __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const { y = ck::type_convert(x); @@ -631,6 +654,8 @@ struct UnaryTypeConvert template <> struct UnaryTypeConvert { + static constexpr const char* name = "UnaryTypeConvert"; + __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const { y = ck::type_convert(x); diff --git a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp index fefa6c793f..a5cbfbb2fc 100644 --- a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp @@ -24,6 +24,8 @@ namespace element_wise { template struct Activation_Mul_Clamp { + static constexpr const char* name = "Activation_Mul_Clamp"; + // Convolution + Activation (piecewise linear function) // If an activation is piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy) // Z = Activation(Y) = Activation(W @ X) @@ -71,6 +73,8 @@ struct Activation_Mul_Clamp template struct Mul_Activation_Mul_Clamp { + static constexpr const char* name = "Mul_Activation_Mul_Clamp"; + // Convolution + Activation (non piecewise linear function) // Z = Activation(Y) = Activation(W @ X) // Sz * Qz = Activation(Sy * Qy) @@ -101,6 +105,8 @@ struct Mul_Activation_Mul_Clamp template struct Activation_Mul2_Clamp { + static constexpr const char* name = "Activation_Mul2_Clamp"; + Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {} __host__ __device__ constexpr void @@ -131,6 +137,8 @@ struct Activation_Mul2_Clamp template struct Add_Activation_Mul_Clamp { + static constexpr const char* name = "Add_Activation_Mul_Clamp"; + // Convolution + bias // Let Bias = B = Sw * Sx * Qb // Where Qb is int32 @@ -175,6 +183,8 @@ struct Add_Activation_Mul_Clamp template struct Add_Activation_Mul2_Clamp { + static constexpr const char* name = "Add_Activation_Mul2_Clamp"; + Add_Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {} __host__ __device__ constexpr void @@ -206,6 +216,8 @@ struct Add_Activation_Mul2_Clamp template struct Add_Mul_Activation_Mul_Clamp { + static constexpr const char* name = "Add_Mul_Activation_Mul_Clamp"; + // Convolution + Activation (non piecewise linear function) // Z = Activation(Y) = Activation(W @ X + B) // Sz * Qz = Activation(Sy * Qy) @@ -250,6 +262,8 @@ struct Add_Mul_Activation_Mul_Clamp template struct Add_Mul2_Activation_Mul_Clamp { + static constexpr const char* name = "Add_Mul2_Activation_Mul_Clamp"; + Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp) : scale_z_inv_(scale_z_inv), activationOp_(activationOp) { diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 59292b30e2..a0b33afb4d 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -157,6 +157,8 @@ namespace element_wise { struct PassThroughPack8 { + static constexpr const char* name = "PassThroughPack8"; + template __host__ __device__ void operator()(Y& y, const X& x) const; @@ -265,6 +267,8 @@ struct PassThroughPack8 struct DequantPack8 { + static constexpr const char* name = "DequantPack8"; + template __host__ __device__ void operator()(Y& y, const X& x, const Z& z) const; @@ -301,6 +305,8 @@ struct DequantPack8 struct PassThroughPack2 { + static constexpr const char* name = "PassThroughPack2"; + template __host__ __device__ void operator()(Y& y, const X& x) const; @@ -558,6 +564,8 @@ struct PassThrough struct UnaryConvert { + static constexpr const char* name = "UnaryConvert"; + template __host__ __device__ void operator()(Y& y, const X& x) const { @@ -567,6 +575,8 @@ struct UnaryConvert struct ConvertBF16RTN { + static constexpr const char* name = "ConvertBF16RTN"; + // convert to bf16 using round to nearest (rtn) template __host__ __device__ void operator()(Y& y, const X& x) const @@ -584,6 +594,8 @@ struct ConvertBF16RTN struct ConvertF8SR { + static constexpr const char* name = "ConvertF8SR"; + // convert to fp8 using stochastic rounding (SR) template __host__ __device__ void operator()(Y& y, const X& x) const @@ -602,6 +614,8 @@ struct ConvertF8SR struct ConvertF8RNE { + static constexpr const char* name = "ConvertF8RNE"; + // convert to fp8 using rounding to nearest even template __host__ __device__ void operator()(Y& y, const X& x) const @@ -667,6 +681,8 @@ struct Scale struct ScaleAndResetNaNToMinusInfinity { + static constexpr const char* name = "ScaleAndResetNaNToMinusInfinity"; + __host__ __device__ ScaleAndResetNaNToMinusInfinity(float scale) : scale_(scale) {} template @@ -683,6 +699,8 @@ struct ScaleAndResetNaNToMinusInfinity struct UnaryDivide { + static constexpr const char* name = "UnaryDivide"; + __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider) {} template @@ -727,6 +745,8 @@ struct UnaryDivide struct UnarySquare { + static constexpr const char* name = "UnarySquare"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -743,6 +763,8 @@ struct UnarySquare struct UnaryAbs { + static constexpr const char* name = "UnaryAbs"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -773,6 +795,8 @@ struct UnaryAbs struct UnarySqrt { + static constexpr const char* name = "UnarySqrt"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -898,6 +922,8 @@ struct Relu // gpu code use lower accuracy "_ocml_exp_f32" and "rcp" function struct FastGelu { + static constexpr const char* name = "FastGelu"; + template __host__ void operator()(Y& y, const X& x) const; @@ -1013,6 +1039,8 @@ struct FastGelu // y = 0.5*x*(1+erf(x/sqrt(2))) struct Gelu { + static constexpr const char* name = "Gelu"; + template __host__ __device__ void operator()(Y& y, const X& x) const; @@ -1031,6 +1059,8 @@ struct Gelu struct Sigmoid { + static constexpr const char* name = "Sigmoid"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1055,6 +1085,8 @@ struct Sigmoid struct Silu { + static constexpr const char* name = "SiLU"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1068,6 +1100,8 @@ struct Silu struct TanH { + static constexpr const char* name = "TanH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1091,6 +1125,8 @@ struct TanH struct ACos { + static constexpr const char* name = "ACos"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1105,6 +1141,8 @@ struct ACos struct Neg { + static constexpr const char* name = "Neg"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1119,6 +1157,8 @@ struct Neg struct ATan { + static constexpr const char* name = "ATan"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1133,6 +1173,8 @@ struct ATan struct Sin { + static constexpr const char* name = "Sin"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1147,6 +1189,8 @@ struct Sin struct ASinH { + static constexpr const char* name = "ASinH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1161,6 +1205,8 @@ struct ASinH struct Cos { + static constexpr const char* name = "Cos"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1175,6 +1221,8 @@ struct Cos struct ACosH { + static constexpr const char* name = "ACosH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1189,6 +1237,8 @@ struct ACosH struct Tan { + static constexpr const char* name = "Tan"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1203,6 +1253,8 @@ struct Tan struct ATanH { + static constexpr const char* name = "ATanH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1217,6 +1269,8 @@ struct ATanH struct SinH { + static constexpr const char* name = "SinH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1231,6 +1285,8 @@ struct SinH struct Ceil { + static constexpr const char* name = "Ceil"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1245,6 +1301,8 @@ struct Ceil struct Exp { + static constexpr const char* name = "Exp"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1259,6 +1317,8 @@ struct Exp struct CosH { + static constexpr const char* name = "CosH"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1273,6 +1333,8 @@ struct CosH struct Floor { + static constexpr const char* name = "Floor"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1287,6 +1349,8 @@ struct Floor struct Log { + static constexpr const char* name = "Log"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1301,6 +1365,8 @@ struct Log struct ASin { + static constexpr const char* name = "ASin"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1315,6 +1381,8 @@ struct ASin struct Rcp { + static constexpr const char* name = "Rcp"; + template __host__ __device__ void operator()(T& y, const T& x) const { @@ -1329,6 +1397,8 @@ struct Rcp struct Swish { + static constexpr const char* name = "Swish"; + Swish(float beta = 1.0f) : beta_(beta) {} template @@ -1358,6 +1428,8 @@ struct Swish struct SoftRelu { + static constexpr const char* name = "SoftRelu"; + SoftRelu(float alpha = 1.f) : alpha_(alpha){}; template @@ -1386,6 +1458,8 @@ struct SoftRelu struct Power { + static constexpr const char* name = "Power"; + Power(float alpha = 0.f, float beta = 1.f, float gamma = 2.f) : alpha_(alpha), beta_(beta), gamma_(gamma){}; @@ -1420,6 +1494,8 @@ struct Power struct ClippedRelu { + static constexpr const char* name = "ClippedRelu"; + ClippedRelu(float alpha = 0.f, float beta = 1.f) : alpha_(alpha), beta_(beta){}; template @@ -1449,6 +1525,8 @@ struct ClippedRelu struct LeakyRelu { + static constexpr const char* name = "LeakyRelu"; + LeakyRelu(float alpha = 0.01f) : alpha_(alpha){}; template @@ -1476,6 +1554,8 @@ struct LeakyRelu struct Elu { + static constexpr const char* name = "Elu"; + Elu(float alpha = 1.f) : alpha_(alpha){}; template @@ -1503,6 +1583,8 @@ struct Elu struct Logistic { + static constexpr const char* name = "Logistic"; + Logistic(float alpha = 1.f) : alpha_(alpha){}; template @@ -1531,6 +1613,8 @@ struct Logistic struct ConvInvscale { + static constexpr const char* name = "ConvInvscale"; + __host__ __device__ ConvInvscale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) @@ -1554,6 +1638,8 @@ struct ConvInvscale struct ConvScale { + static constexpr const char* name = "ConvScale"; + __host__ __device__ ConvScale(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f) @@ -1577,6 +1663,8 @@ struct ConvScale struct ConvScaleRelu { + static constexpr const char* name = "ConvScaleRelu"; + __host__ __device__ ConvScaleRelu(float scale_in = 1.f, float scale_wei = 1.f, float scale_out = 1.f)