Add support to fp16 + compute fp16 and bf16 + compute bf16 contractions (#3598)

* Add support to fp16 + compute fp16 and bf16 + compute bf16 contractions

Enables hipTensor to access the WMMA HW functionalities
for these combinations of datatype on gfx11 and gfx12.

* Fix change to contraction scale tests

* Fix clang-format
This commit is contained in:
Estevan Vedovelli
2026-01-20 12:39:57 -05:00
committed by GitHub
parent 4d58c70e6c
commit 7d8bca7ddc
44 changed files with 2762 additions and 24 deletions

View File

@@ -282,6 +282,58 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_comp
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
@@ -336,6 +388,58 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_comp
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
@@ -654,6 +758,58 @@ void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_comp
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
@@ -708,6 +864,58 @@ void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_comp
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances);
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
@@ -938,7 +1146,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
{
if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::half_t>)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
op_ptrs);
@@ -952,7 +1171,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
}
else if constexpr(NumDimM == 6 && NumDimN == 6 && NumDimK == 6)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::half_t>)
{
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
op_ptrs);
@@ -972,7 +1202,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
{
if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
op_ptrs);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
op_ptrs);
@@ -986,7 +1227,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
}
else if constexpr(NumDimM == 6 && NumDimN == 6 && NumDimK == 6)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
{
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
op_ptrs);
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
op_ptrs);

View File

@@ -282,6 +282,58 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
@@ -336,6 +388,58 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
@@ -654,6 +758,58 @@ void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
@@ -708,6 +864,58 @@ void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
@@ -759,7 +967,7 @@ void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_
PassThrough,
Scale,
F32>>>& instances);
#endif // CK_ENABLE_FP16
#endif // CK_ENABLE_BF16
// Contraction + Scale
template <index_t NumDimM,
@@ -937,7 +1145,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
{
if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::half_t>)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
op_ptrs);
@@ -951,7 +1170,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
}
else if constexpr(NumDimM == 6 && NumDimN == 6 && NumDimK == 6)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::half_t>)
{
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
op_ptrs);
@@ -971,7 +1201,18 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
{
if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2)
{
if constexpr(is_same_v<ComputeDataType, float>)
if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
op_ptrs);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
op_ptrs);
}
else if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
op_ptrs);
@@ -985,6 +1226,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
}
else if constexpr(NumDimM == 6 && NumDimN == 6 && NumDimK == 6)
{
if constexpr(is_same_v<ComputeDataType, ck::bhalf_t>)
{
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
op_ptrs);
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
op_ptrs);
}
if constexpr(is_same_v<ComputeDataType, float>)
{
add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
device_contraction_kk_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
device_contraction_kn_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
device_contraction_mk_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
device_contraction_mn_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
device_contraction_kk_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
device_contraction_kn_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
device_contraction_mk_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
device_contraction_mn_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
2>;
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
device_contraction_kk_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
device_contraction_kn_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
device_contraction_mk_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
device_contraction_mn_instance<BF16,
BF16,
F32,
BF16,
BF16_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
BF16_Tuple,
BF16,
PassThrough,
PassThrough,
Bilinear,
BF16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,60 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
device_contraction_kk_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: before add, size=%zu\n", instances.size());
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: after add, size=%zu\n", instances.size());
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
device_contraction_kn_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
device_contraction_mk_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,58 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
device_contraction_mn_instance<F16,
F16,
F32,
F16,
F16_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Bilinear,
6>;
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
F16_Tuple,
F16,
PassThrough,
PassThrough,
Bilinear,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -37,12 +37,22 @@ foreach(idx IN LISTS DIMS)
${PREFIX}_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp)
# FP16
list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES ${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp)
list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES ${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp)
# BF16
list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp)
list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance =
device_contraction_kk_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance =
device_contraction_kn_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance =
device_contraction_mk_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance =
device_contraction_mn_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance =
device_contraction_kk_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance =
device_contraction_kn_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance =
device_contraction_mk_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance =
device_contraction_mn_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
2>;
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
2,
2,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance =
device_contraction_kk_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance =
device_contraction_kn_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance =
device_contraction_mk_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance =
device_contraction_mn_instance<BF16,
BF16,
F32,
BF16,
Empty_Tuple,
BF16,
BF16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
BF16,
BF16,
Empty_Tuple,
BF16,
PassThrough,
PassThrough,
Scale,
BF16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance =
device_contraction_kk_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance =
device_contraction_kn_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance =
device_contraction_mk_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -0,0 +1,57 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance =
device_contraction_mn_instance<F16,
F16,
F32,
F16,
Empty_Tuple,
F16,
F16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F16,
F16,
Empty_Tuple,
F16,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -37,12 +37,22 @@ foreach(idx IN LISTS DIMS)
${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp)
# FP16
list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp)
list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp)
# BF16
list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp)
list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp
${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp