From 8655ba989ccd3b1b5d2590828e157299c777b3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Mon, 4 Aug 2025 16:49:55 +0200 Subject: [PATCH 01/21] Mark non-grouped convolutions instances as deprecated (#2595) * Mark non-grouped convolutions instances as deprecated * Update CHANGELOG.md Co-authored-by: John Afaganis * Update library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp Co-authored-by: John Afaganis --------- Co-authored-by: John Afaganis --- CHANGELOG.md | 4 ++++ ...vice_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp | 8 +++++++- ...evice_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp | 8 +++++++- ...evice_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp | 8 +++++++- ...vice_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp | 8 +++++++- ...ice_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- ...ice_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp | 8 +++++++- ...ce_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp | 8 +++++++- ...e_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 8 +++++++- ...ce_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- ...ce_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 8 +++++++- ...e_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 8 +++++++- ...nv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- ...device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 8 +++++++- .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 8 +++++++- ...device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 8 +++++++- ...dl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- ..._shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +++++++- ...onv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 8 +++++++- ...conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 8 +++++++- 
...conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 8 +++++++- ...onv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 8 +++++++- 23 files changed, 158 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c054b822a..7a21634b7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,10 @@ None None +### Upcoming changes + +* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution. + ## Composable Kernel 1.1.0 for ROCm 6.1.0 ### Additions diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp index e3e90c966d..3c332c3b22 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -90,10 +90,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are deprecated. They may be removed in a future release." 
add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp index 81e9122d95..aaaeda0312 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -83,10 +83,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( DeviceConvBwdData<1, NWC, KXC, NWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp index dbc82168f4..331cc3c4b2 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -82,10 +82,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp index 3ac250f3e6..4e51074b3a 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -87,10 +87,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp index 6ca909c35e..58b3f8e37d 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp index d263e98851..a487f0a6f0 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp index bc949e757c..cfd4f849b8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 366d1fe160..c2f55d94eb 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -140,6 +140,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{}); add_device_operation_instances( @@ -149,6 +151,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( add_device_operation_instances( instances, device_conv_dedidecate_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 422e37e926..5df1c9cf39 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -142,6 +142,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -150,6 +152,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 5993f6bd7a..76ca976e37 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -139,6 +139,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( @@ -147,6 +149,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 2f079c234c..8221515caa 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -136,6 +136,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( @@ -144,6 +146,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp index 86c17aacf0..d7a82fdd2c 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -180,6 +180,8 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -200,6 +202,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 63c612523f..153b770e1b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -114,12 +114,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 0f3b9e7939..fd0c94250f 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -107,11 +107,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 14f9b5cd6a..038316ac31 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -106,11 +106,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f32_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 3f641cdadc..c77c8683c8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -111,12 +111,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_int8_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp index 3402653e84..97830449ee 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -179,6 +179,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instanc void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( std::vector>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -203,6 +205,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp index faac2813ba..e5c682d3cd 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -177,6 +177,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_ins void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances( std::vector>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -204,6 +206,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instan instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index 94b2a47e50..0b9a6c2b8d 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 4244ab7b87..6c54552cc8 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index 5c7db4ca3b..363e342c1b 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -89,10 +89,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index ebc56487a1..35bca49fed 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -87,10 +87,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance From 15eb493152b4cddff947159ea4b829e1f55c56f3 Mon Sep 17 00:00:00 2001 From: Jinchao Xu Date: Tue, 5 Aug 2025 02:26:08 +0800 Subject: [PATCH 02/21] Add -gsplit-dwarf flag to reduce debug section size and fix ckProfiler link errors (#2611) Resolves R_X86_64_32 relocation out of range errors in grouped conv2d instances by splitting debug information into separate .dwo files. Add explicit cast to avoid signed/unsigned comparison warning. --- include/ck_tile/host/reference/reference_softmax.hpp | 4 ++-- include/ck_tile/host/reference/reference_topk.hpp | 9 +++++---- library/src/tensor_operation_instance/gpu/CMakeLists.txt | 4 ++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/ck_tile/host/reference/reference_softmax.hpp b/include/ck_tile/host/reference/reference_softmax.hpp index d86e879944..4e729c437d 100644 --- a/include/ck_tile/host/reference/reference_softmax.hpp +++ b/include/ck_tile/host/reference/reference_softmax.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -14,7 +14,7 @@ CK_TILE_HOST void reference_softmax(const HostTensor& x, HostTensor& y, index_t dim = -1) { index_t rank = x.get_num_of_dimension(); - assert(rank == y.get_num_of_dimension()); + assert(static_cast(rank) == y.get_num_of_dimension()); assert(dim == -1 || dim < rank); index_t target_dim = dim == -1 ? (rank - 1) : dim; diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp index 3d0404a2e5..0fc99a983a 100644 --- a/include/ck_tile/host/reference/reference_topk.hpp +++ b/include/ck_tile/host/reference/reference_topk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x, { // rank must be the same index_t rank = x.get_num_of_dimension(); - assert(rank == y_values.get_num_of_dimension()); - assert(rank == y_indices.get_num_of_dimension()); + assert(static_cast(rank) == y_values.get_num_of_dimension()); + assert(static_cast(rank) == y_indices.get_num_of_dimension()); assert(dim == -1 || dim < rank); index_t topk_dim = dim == -1 ? 
(rank - 1) : dim; @@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x, auto x_len = x.get_lengths(); assert(k <= topk_src_len); - assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim)); + assert(static_cast(k) == y_values.get_length(topk_dim) && + static_cast(k) == y_indices.get_length(topk_dim)); index_t n_parallel = x.get_element_size() / topk_src_len; diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 5204b51edf..1eaaa7e6ba 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME) target_compile_features(${INSTANCE_NAME} PUBLIC) + # splits debug information into separate .dwo files to reduce debug section size + if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf) + endif() # flags to compress the library if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}") From 59245df46d1090bfb1cd438d867c15a300989d63 Mon Sep 17 00:00:00 2001 From: rahjain-amd Date: Mon, 4 Aug 2025 23:58:09 +0530 Subject: [PATCH 03/21] Fix Debug Build for ckProfiler (#2609) Problem ======= relocation R_X86_64_32 out of range: 5405348154 is not in [0, 4294967295] Solution ======== The problem was caused by a limitation of the 32-bit offsets used in the original DWARF standard. We have the option to switch to 64-bit offsets for the libs, which frees us from the 4G size boundary. Add -gdwarf64 and -Og to avoid this limit. 
--- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49376d139..19c036e1a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-switch-default) add_compile_options(-Wno-unique-object-duplication) +# add -Og -gdwarf64 for debug builds +add_compile_options( + "$<$:-Og>" + "$<$:-gdwarf64>" +) + # Recent change in compiler makes this warning ON by default, which led to compile errors. add_compile_options(-Wno-nrvo) From fb96b49666ddd4d7ccfd3528b1859796657e1a6b Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 4 Aug 2025 11:43:47 -0700 Subject: [PATCH 04/21] fix test_mx_mfma errors (#2614) --- test/mx_mfma_op/mx_mfma_op.hpp | 46 +++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp index 4bb38a0c16..b2e615b9d8 100644 --- a/test/mx_mfma_op/mx_mfma_op.hpp +++ b/test/mx_mfma_op/mx_mfma_op.hpp @@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr) auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M); auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M); - using ARawT = typename scalar_type::type; - using AScalarFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using ARawT = typename scalar_type::type; + using AScalarFragT = typename vector_type< + ARawT, + BLOCK_M * BLOCK_K / WAVE_SIZE / + (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)>::type; AScalarFragT fragA{}; @@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) // Flatten to 1D row_major offsets. 
auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; }; - using ARawT = typename scalar_type::type; - using AScalarChunkT = vector_type::vector_size / num_chunks>::type; + using ARawT = typename scalar_type::type; + using AScalarChunkT = + typename vector_type::vector_size / num_chunks>::type; union { @@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col - using BRawT = typename scalar_type::type; - using BScalarChunkT = vector_type::vector_size / num_chunks>::type; + using BRawT = typename scalar_type::type; + using BScalarChunkT = + typename vector_type::vector_size / num_chunks>::type; union { @@ -780,7 +782,7 @@ struct store_C_col_major // we can vector store 4 contiguous elements at a time. using CRawT = typename scalar_type::type; - using CScalarFragT = vector_type::type; + using CScalarFragT = typename vector_type::type; union { CFragT frag; @@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t* a, const packed_type_t assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + typename vector_type::type; + using BFragT = + typename vector_type::type; - using CFragT = vector_type::type; + using CFragT = typename vector_type::type; using AccumFragT = vector_type; - using RawAccumFragT = vector_type::type; + using RawAccumFragT = typename vector_type::type; // Create frags auto fragA = AFragT{}; @@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t* a, assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + typename vector_type::type; + using BFragT = + typename vector_type::type; - using CFragT = vector_type::type; + using CFragT = typename 
vector_type::type; using AccumFragT = vector_type; - using RawAccumFragT = vector_type::type; - using AScaleFragT = vector_type::type; - using BScaleFragT = vector_type::type; + using RawAccumFragT = typename vector_type::type; + using AScaleFragT = typename vector_type::type; + using BScaleFragT = typename vector_type::type; // Create frags auto fragA = AFragT{}; From 2a78da47082edbff25b5cf2c5b43eeea673f1485 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Mon, 4 Aug 2025 17:43:15 -0700 Subject: [PATCH 05/21] fix build for test_ck_tile_fp8 on rhel8 (#2615) --- test/ck_tile/data_type/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ck_tile/data_type/CMakeLists.txt b/test/ck_tile/data_type/CMakeLists.txt index a9ce48d1de..a9461dca9c 100644 --- a/test/ck_tile/data_type/CMakeLists.txt +++ b/test/ck_tile/data_type/CMakeLists.txt @@ -8,6 +8,7 @@ endif() if(CK_USE_OCP_FP8 OR CK_USE_FNUZ_FP8) add_gtest_executable(test_ck_tile_fp8 test_fp8.cpp) target_compile_options(test_ck_tile_fp8 PRIVATE -Wno-float-equal) + target_compile_definitions(test_ck_tile_fp8 PUBLIC GTEST_HAS_RTTI=0) # conditionally specify the use of OCP_FP8 if(CK_USE_OCP_FP8) target_compile_options(test_ck_tile_fp8 PRIVATE -DCK_TILE_USE_OCP_FP8) From cbfecf8d7aa50ae64c26f5aba6fef9f2eaab743e Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Mon, 4 Aug 2025 23:43:01 -0700 Subject: [PATCH 06/21] Persistent grouped gemm CompV4 Enablement & Polish (#2605) * enable the persistent kernel for CompV4 * polish the example and clang format * fix the non-persistent kernel error --------- Co-authored-by: ThomasNing --- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 1 - .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 122 ++++-------- .../ck_tile/17_grouped_gemm/grouped_gemm.hpp | 2 +- .../17_grouped_gemm/grouped_gemm_tileloop.cpp | 176 ------------------ .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 130 ++++++++++--- .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 
6 +- 6 files changed, 148 insertions(+), 289 deletions(-) delete mode 100644 example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index 79df4e624d..475c13166d 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1 @@ add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) -add_executable(tile_example_grouped_gemm_tileloop EXCLUDE_FROM_ALL grouped_gemm_tileloop.cpp) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index bb0a0d5840..897952f03c 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
#include @@ -16,19 +16,11 @@ #include "ck_tile/host.hpp" #include "grouped_gemm.hpp" -template -float grouped_gemm(const std::vector& gemm_descs, - const ck_tile::stream_config& s, - void* kargs_ptr) +template +float grouped_gemm_tileloop(const ck_tile::stream_config& s, + const ck_tile::index_t num_groups, + void* kargs_ptr, + bool splitk) { #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler @@ -83,8 +75,6 @@ float grouped_gemm(const std::vector& gemm_descs, constexpr bool kPadN = false; constexpr bool kPadK = false; - constexpr bool TransposeC = false; - constexpr int kBlockPerCu = 1; constexpr ck_tile::index_t TileParitionerGroupNum = 8; constexpr ck_tile::index_t TileParitionerM01 = 4; @@ -97,54 +87,41 @@ float grouped_gemm(const std::vector& gemm_descs, GemmSpatiallyLocalTilePartitioner; using Traits = ck_tile::TileGemmTraits; - using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; - using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE; - - const ck_tile::index_t k_grain = gemm_descs[0].k_batch * K_Tile; - const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); - const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, - const auto tail_number_, - const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; + const auto Run = [&](const auto memory_operation_) { constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; constexpr auto memory_operation = memory_operation_.value; + // We create the GEMM pipeline without specifying hotloop or 
tailnumber. + // These are automatically run inside the kernel based on the given input data. using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + scheduler>; using GemmPipeline = GEMM_PIPELINE; using GemmEpilogue = ck_tile::CShuffleEpilogue< ck_tile::CShuffleEpilogueProblem, AccDataType, CDataType, - DsLayout, + ck_tile::tuple<>, CLayout, - CDEElementWise, + ck_tile::element_wise::PassThrough, GemmPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, @@ -156,20 +133,8 @@ float grouped_gemm(const std::vector& gemm_descs, UniversalGemmProblem::TransposeC, memory_operation>>; using Kernel = ck_tile::GroupedGemmKernel; - auto kargs = Kernel::MakeKargs(gemm_descs); - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Kernel arguments not supported!"); - } - constexpr dim3 blocks = Kernel::BlockSize(); - const dim3 grids = Kernel::GridSize(gemm_descs); - - HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr, - kargs.data(), - get_workspace_size(gemm_descs), - hipMemcpyHostToDevice, - s.stream_id_)); + const dim3 grids = Kernel::MaxOccupancyGridSize(s); if(s.log_level_ > 0) { @@ -186,45 +151,26 @@ float grouped_gemm(const std::vector& gemm_descs, blocks, 0, ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), - gemm_descs.size())); + num_groups)); return ave_time; }; - const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { - if(gemm_descs[0].k_batch == 1) - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - else - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - }; - - BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + if(!splitk) + { + Run(ck_tile::integral_constant{}); + } + else + { + Run(ck_tile::integral_constant{}); + } return ave_time; } #include "run_grouped_gemm_example.inc" -constexpr bool Persistent = false; -int main(int argc, char* argv[]) -{ - try - { - return !run_grouped_gemm_example(argc, 
argv); - } - catch(const std::runtime_error& e) - { - std::cerr << "Runtime error: " << e.what() << '\n'; - return EXIT_FAILURE; - } -} +constexpr bool Persistent = true; +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 74efb1bdeb..89d91fbef6 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -15,7 +15,7 @@ #define CK_TILE_PIPELINE_COMPUTE_V4 3 #ifndef CK_TILE_PIPELINE_DEFAULT -#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3 +#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V4 #endif #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp deleted file mode 100644 index 897952f03c..0000000000 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/ops/epilogue.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "ck_tile/host.hpp" -#include "grouped_gemm.hpp" - -template -float grouped_gemm_tileloop(const ck_tile::stream_config& s, - const ck_tile::index_t num_groups, - void* kargs_ptr, - bool splitk) -{ -#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) - // Memory friendly for Interwave scheduler - constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 32; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 4; - constexpr ck_tile::index_t N_Warp = 1; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; - - constexpr bool DoubleSmemBuffer = false; -#endif -#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) - // Compute friendly for Intrawave scheduler - constexpr ck_tile::index_t M_Tile = 256; - constexpr ck_tile::index_t N_Tile = 256; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 16; - - constexpr bool DoubleSmemBuffer = false; -#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) - // Compute friendly for Intrawave scheduler - // Using the ping pong reader in the lds level - constexpr ck_tile::index_t M_Tile = 256; - constexpr ck_tile::index_t N_Tile = 256; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr 
ck_tile::index_t K_Warp_Tile = 16; - - constexpr bool DoubleSmemBuffer = true; -#endif - - constexpr bool kPadM = false; - constexpr bool kPadN = false; - constexpr bool kPadK = false; - - constexpr int kBlockPerCu = 1; - constexpr ck_tile::index_t TileParitionerGroupNum = 8; - constexpr ck_tile::index_t TileParitionerM01 = 4; - - using GemmShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - using TilePartitioner = ck_tile:: - GemmSpatiallyLocalTilePartitioner; - - using Traits = ck_tile::TileGemmTraits; - using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; - using GemmPipelineProblem = - ck_tile::GemmPipelineProblem; - - float ave_time{0}; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; - constexpr auto memory_operation = memory_operation_.value; - - // We create the GEMM pipeline without specifying hotloop or tailnumber. - // These are automatically run inside the kernel based on the given input data. 
- using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem, - AccDataType, - CDataType, - ck_tile::tuple<>, - CLayout, - ck_tile::element_wise::PassThrough, - GemmPipelineProblem::kBlockSize, - TilePartitioner::MPerBlock, - TilePartitioner::NPerBlock, - M_Warp, - N_Warp, - M_Warp_Tile, - N_Warp_Tile, - K_Warp_Tile, - UniversalGemmProblem::TransposeC, - memory_operation>>; - using Kernel = ck_tile::GroupedGemmKernel; - constexpr dim3 blocks = Kernel::BlockSize(); - const dim3 grids = Kernel::MaxOccupancyGridSize(s); - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {" - << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {" - << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl; - } - - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, - grids, - blocks, - 0, - ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), - num_groups)); - - return ave_time; - }; - - if(!splitk) - { - Run(ck_tile::integral_constant{}); - } - else - { - Run(ck_tile::integral_constant{}); - } - - return ave_time; -} - -#include "run_grouped_gemm_example.inc" - -constexpr bool Persistent = true; -int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index 921ea11720..477a87d42f 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -252,13 +252,6 @@ struct GroupedGemmKernel return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } - CK_TILE_DEVICE void Run(const GemmTransKernelArg& kargs, - const tuple& block_idx_2d, - const index_t block_idx_z) const - { - Run(kargs.group_karg, 
block_idx_2d, block_idx_z); - } - CK_TILE_DEVICE void Run(const UniversalGemmKernelArgs<>& kargs, const tuple& block_idx_2d, const index_t block_idx_z) const @@ -277,24 +270,56 @@ struct GroupedGemmKernel CDataType* c_ptr = static_cast(kargs.e_ptr); // allocate LDS - __shared__ char smem_ptr[GetSmemSize()]; + __shared__ char smem_ptr_0[GetSmemSize()]; - if constexpr(UsePersistentKernel) + if constexpr(GemmPipeline::DoubleSmemBuffer == true) { - RunGemmWithPipelineSelection( - a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + __shared__ char smem_ptr_1[GetSmemSize()]; + if constexpr(UsePersistentKernel) + { + RunGemmWithPipelineSelection2LDS(a_ptr, + b_ptr, + c_ptr, + smem_ptr_0, + smem_ptr_1, + kargs, + splitk_batch_offset, + i_m, + i_n); + } + else + { + Base::RunGemm2LDS({a_ptr}, + {b_ptr}, + {/*ds_ptr*/}, + c_ptr, + smem_ptr_0, + smem_ptr_1, + kargs, + splitk_batch_offset, + i_m, + i_n); + } } else { - Base::RunGemm({a_ptr}, - {b_ptr}, - {/*ds_ptr*/}, - c_ptr, - smem_ptr, - kargs, - splitk_batch_offset, - i_m, - i_n); + if constexpr(UsePersistentKernel) + { + RunGemmWithPipelineSelection( + a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n); + } + else + { + Base::RunGemm({a_ptr}, + {b_ptr}, + {/*ds_ptr*/}, + c_ptr, + smem_ptr_0, + kargs, + splitk_batch_offset, + i_m, + i_n); + } } } @@ -358,6 +383,69 @@ struct GroupedGemmKernel c_block_window, c_block_tile, d_block_window, smem_ptr_0); } + /** + * @brief Runs single GEMM problem cooperatively by whole workgroup. + * + * @note The GEMM pipeline is selected in-kernel based on the number of K-loops + * and the tail-number. This is needed for the persistent tile-loop when + * we didn't have access to the K dimension on the host. + * + * @param a_ptr input A pointer + * @param b_ptr input B pointer + * @param c_ptr output C pointer + * @param smem_ptr_0 The start memory pointer of the shared memory block. 
+ * @param smem_ptr_1 The second start memory pointer of the shared memory block. + * @param kargs GEMM kernel arguments + * @param splitk_batch_offset Utility structure used to calculate k + * batch. + * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. + * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. + * + */ + CK_TILE_DEVICE static void + RunGemmWithPipelineSelection2LDS(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + void* __restrict__ smem_ptr_0, + void* __restrict__ smem_ptr_1, + const UniversalGemmKernelArgs<>& kargs, + const typename Base::SplitKBatchOffset& splitk_batch_offset, + const index_t block_idx_m, + const index_t block_idx_n) + { + // Create Gemm tensor views, pad views and tile windows + const auto& gemm_tensor_views_tuple = + Base::template MakeGemmTensorViews( + {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset); + + const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = + Base::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); + const auto& a_block_window = gemm_tile_windows.at(Base::I0); + const auto& b_block_window = gemm_tile_windows.at(Base::I1); + const auto& d_block_window = gemm_tile_windows.at(Base::I2); + + // Get hot-loop and tail configuration + const index_t num_loop = __builtin_amdgcn_readfirstlane( + TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); + const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); + + // Run GEMM pipeline + const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0], + b_block_window[Base::I0], + num_loop, + has_hot_loop, + tail_num, + smem_ptr_0, + smem_ptr_1); + // Run Epilogue Pipeline + auto& c_block_window = gemm_tile_windows.at(Base::I3); + EpiloguePipeline{}.template + 
operator()( + c_block_window, c_block_tile, d_block_window, smem_ptr_0); + } + CK_TILE_DEVICE index_t FindGroupId(const GemmTransKernelArg* gemm_desc_ptr, index_t block_id, index_t group_count) const @@ -401,7 +489,7 @@ struct GroupedGemmKernel kargs.group_karg.M, kargs.group_karg.N, (block_id - kargs.block_start) % grid_size_2d); - Run(kargs, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d); + Run(kargs.group_karg, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d); } // For persistent kernels diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index ac91c2f58f..22c8cf383b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -18,12 +18,14 @@ struct BaseGemmPipelineAgBgCrCompV4 static constexpr index_t PrefillStages = 1; static constexpr index_t GlobalBufferNum = 1; - CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) + static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel; + + CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop) { return num_loop > PrefetchStages; } - CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) + CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) { if(num_loop % PrefetchStages == 1) { From 2203b0ddfe06f4f9f5126e54e78697dfb16118d4 Mon Sep 17 00:00:00 2001 From: Enrico Degregori <73224202+EnricoDeg@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:23:19 +0200 Subject: [PATCH 07/21] Add padding to 1x1Stride1Pad0 conv specialization (grouped conv bwd weight) (#2610) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add padding 1x1Stride1Pad0 conv specialization * Add gridwise checks for conv cshufflev3 * Merge padding with previous transforms * 
Apply transform changes for padding to default specialization as well --------- Co-authored-by: Bartłomiej Kocot --- include/ck/ck.hpp | 3 - ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 11 +- .../gridwise_gemm_xdl_cshuffle_conv_v3.hpp | 198 ++++++++++++++++++ .../transform_conv_bwd_weight_to_gemm.hpp | 126 ++++------- .../transform_conv_bwd_weight_to_gemm_v2.hpp | 120 ++++------- 5 files changed, 290 insertions(+), 168 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 794c6f4e20..09801203ba 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -222,9 +222,6 @@ // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 -// workaround: conv crash when K, C is even -#define CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN 1 - // workaround: compiler crash when compiling recursive lambda #define CK_WORKAROUND_SWDEV_275126 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index 1cd1f16245..ed64b83356 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -331,9 +331,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 using CGridDesc_M_N = remove_cvref_t; using GridwiseGemm = GridwiseGemm_xdl_cshuffle_conv_v3< - tensor_layout::gemm::RowMajor, tensor_layout::gemm::ColumnMajor, tensor_layout::gemm::RowMajor, + tensor_layout::gemm::RowMajor, ADataType, BDataType, AccDataType, @@ -1299,13 +1299,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 if constexpr(ConvBackwardWeightSpecialization == ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) { -// workaround: disable when K, C is even -#if 
CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN - if(arg.Conv_C_ % 2 == 0 || arg.Conv_K_ % 2 == 0) - { - return false; - } -#endif // check if it's 1x1, stride=1 pad = 0 conv for(int i = 0; i < NDimSpatial; i++) { @@ -1330,7 +1323,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 } // Gridwise GEMM size - return true; + return GridwiseGemm::CheckValidity(gemm_arg); } bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp index 68112489ca..382d2870e8 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck/utility/common_header.hpp" +#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -606,6 +607,203 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3 c_block_size * sizeof(CShuffleDataType)); } + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + __host__ static constexpr bool CheckValidity(const Argument& karg) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + !(is_same::value)) + { + if(!(karg.M % MPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M value is not a multiple of MPerBlock! 
M: " << karg.M << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && + (is_same::value)) + { + if(!(karg.N % NPerBlock == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) + { + + auto K_t = karg.KBatch * KPerBlock; + if(!(karg.K % K_t == 0)) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: " + << karg.K << " " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + auto K_t = karg.KBatch * KReadVec; + auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; + if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) + { + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.K % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! 
" << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.M % ABlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" + << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + else + { + if(karg.K % BBlockTransferSrcScalarPerVector != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg K (" << karg.K + << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" + << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" + << __LINE__ << ", in function: " << __func__ << std::endl; + } + return false; + } + } + + if constexpr(is_same::value) + { + if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg N (" << karg.N + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! 
" + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + else + { + if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << "Arg M (" << karg.M + << ") value is not a multiple of " + "CShuffleBlockTransferScalarPerVector_NPerBlock (" + << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " + << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ + << std::endl; + } + return false; + } + } + + if constexpr(!(is_same, half_t>::value || + is_same, float>::value || + is_same, bhalf_t>::value || + is_same, int32_t>::value)) + { + if(!karg.IsReduceAdd()) + { + if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) + { + std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__ + << ":" << __LINE__ << ", in function: " << __func__ << std::endl; + } + if(karg.KBatch > 1) + { + return false; + } + } + } + + // check gridwise gemm pipeline + const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value); + + if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) + { + if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages) + { + return false; + } + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { const index_t num_loop = K / KPerBlock; diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp index bd3ab10802..efc7f20cdc 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp @@ -192,7 +192,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( 
out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -210,7 +210,7 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -218,9 +218,17 @@ struct TransformConvBwdWeightToGemm const auto wei_gemmm_gemmn_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -240,7 +248,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -279,7 +287,7 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -288,26 +296,6 @@ struct TransformConvBwdWeightToGemm make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -315,8 +303,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } @@ -392,7 +380,7 @@ struct TransformConvBwdWeightToGemm const auto 
out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -407,13 +395,21 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_grid_desc, + make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -428,7 +424,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -469,31 +465,11 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + 
make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -501,8 +477,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } @@ -585,7 +561,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), 
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -600,13 +576,21 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_grid_desc, + make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -621,7 +605,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -671,31 +655,11 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - 
make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch), - make_pass_through_transform(GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -703,8 +667,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } // function end diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp index b72ddb8243..e410f06190 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp @@ -374,7 +374,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -390,13 +390,21 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_grid_desc, + make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -412,7 +420,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -453,29 +461,11 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, 
Sequence<1>{})); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -483,8 +473,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } @@ -562,7 +552,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -578,13 +568,21 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( 
in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_grid_desc, + make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -600,7 +598,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -650,29 +648,11 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, 
Sequence<1>{}, Sequence<2>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -680,8 +660,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } @@ -765,7 +745,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -781,13 +761,21 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + // Padd + const auto wei_gemmm_gemmn_pad_grid_desc = + transform_tensor_descriptor(wei_grid_desc, + 
make_tuple(make_right_pad_transform(GemmM, PadGemmM), + make_right_pad_transform(GemmN, PadGemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_grid_desc); + wei_gemmm_gemmn_pad_grid_desc); } else { @@ -803,7 +791,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmM)), + make_right_pad_transform(GemmM, PadGemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -868,29 +856,11 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_pass_through_transform(GemmN)), + make_right_pad_transform(GemmN, PadGemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // Padd - const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmM, PadGemmM), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = - transform_tensor_descriptor( - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), - make_right_pad_transform(GemmN, PadGemmN), - make_pass_through_transform(GemmK1Number)), - make_tuple(Sequence<0>{}, Sequence<1>{}, 
Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -898,8 +868,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } // function end From 833ae1d051d5e9e658afb43a63c73de108ee87d3 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Tue, 5 Aug 2025 09:27:55 -0700 Subject: [PATCH 08/21] Revert "Reduce build time tile engine (#2579)" (#2623) This reverts commit e5b79b26fae87a9e610a805e7feed6eb1e30158c. --- Jenkinsfile | 146 +++++++++++++- tile_engine/ops/gemm/CMakeLists.txt | 287 +++++++++++----------------- 2 files changed, 254 insertions(+), 179 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b70c28ad39..0363b07d89 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -438,6 +438,34 @@ def cmake_build(Map conf=[:]){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." } } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + archiveArtifacts "perf_transpose_*.log" + if (arch_type == 1){ + stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a" + } + else if (arch_type == 2){ + stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." 
+ } + } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + archiveArtifacts "perf_tile_gemm_**.log" + if (arch == 1){ + stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a" + } + else if (arch == 2){ + stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." + } + } } def buildHipClangJob(Map conf=[:]){ @@ -734,6 +762,24 @@ def process_results(Map conf=[:]){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." } } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + unstash "perf_transpose_log_gfx942" + unstash "perf_transpose_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the Transpose performance logs: ${err.getMessage()}." + } + } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + unstash "perf_tile_gemm_log_gfx942" + unstash "perf_tile_gemm_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the GEMM performance logs: ${err.getMessage()}." + } + } if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){ // unstash deb packages unstash "packages" @@ -815,7 +861,7 @@ def run_aiter_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -895,6 +941,14 @@ pipeline { name: "RUN_CK_TILE_FMHA_TESTS", defaultValue: false, description: "Run the ck_tile FMHA tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_TRANSPOSE_TESTS", + defaultValue: false, + description: "Run the ck_tile Transpose tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_GEMM_TESTS", + defaultValue: false, + description: "Run the ck_tile GEMM tests (default: OFF)") booleanParam( name: "RUN_TILE_ENGINE_GEMM_TESTS", defaultValue: false, @@ -1144,6 +1198,94 @@ pipeline { } } } + stage("Run CK_TILE_TRANSPOSE Tests") + { + parallel + { + stage("Run CK_TILE_TRANSPOSE Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run 
CK_TILE_TRANSPOSE Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } + stage("Run CK_TILE_GEMM Tests") + { + parallel + { + stage("Run CK_TILE_GEMM Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_gemm_universal && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run CK_TILE_GEMM Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_gemm_universal && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } stage("Run TILE_ENGINE_GEMM Tests") { parallel @@ -1350,7 
+1492,7 @@ pipeline { -DGPU_TARGETS="gfx90a" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j 32""" + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index d8200ed947..fe9b7802a7 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,215 +1,148 @@ + set(GEMM_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon-separated)") set(GEMM_LAYOUT "rcr" CACHE STRING "List of layout for GEMM (semicolon-separated)") -# Pre-generate all kernel lists to avoid blocking during parallel builds -foreach(dt IN LISTS GEMM_DATATYPE) - foreach(l IN LISTS GEMM_LAYOUT) - set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${dt}/${l}") - file(MAKE_DIRECTORY "${working_path}") - - if (l STREQUAL "rcr") - set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") - else() - set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") - endif() - - # Only run if files don't exist - if (NOT EXISTS "${working_path}/gemm_instance_blobs.txt") - execute_process( - COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" - --working_path "${working_path}" - --datatype "${dt}" - --layout "${l}" - --config_json "${json_blob}" - --list_blobs - RESULT_VARIABLE ret - ) - if (NOT ret EQUAL 0) - message(FATAL_ERROR "Failed to pre-generate kernel list for ${dt} ${l}") - endif() - endif() - endforeach() -endforeach() - function(build_gemm_for_datatype datatype layout) set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}") - if (layout STREQUAL "rcr") + # Comment this if-else block when using user_provided_config + if(layout STREQUAL "rcr") set(json_blob 
"${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") else() set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") endif() - # Uncomment to override: - # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json") - # Read pre-generated kernel lists + # uncomment this if you want to use user_provided_config.json + # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json") + + # Generate kernel list + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --layout ${layout} + --config_json ${json_blob} + --list_blobs + RESULT_VARIABLE ret + ) + if(NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}") + endif() + file(STRINGS "${working_path}/gemm_instance_blobs.txt" codegen_blobs) file(STRINGS "${working_path}/gemm_instance_blobs_range.txt" codegen_blobs_range) - + # Generate the blobs add_custom_command( OUTPUT ${codegen_blobs} - COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py --working_path "${working_path}" - --datatype "${datatype}" - --layout "${layout}" + --datatype ${datatype} + --layout ${layout} --config_json "${json_blob}" --gen_blobs COMMENT "Generating GEMM instance sources for ${datatype} ${layout}" ) add_custom_target(gemm_gen_${datatype}_${layout} DEPENDS ${codegen_blobs}) - # Parse ranges to identify unique trait names - set(unique_traits) - foreach(range_line IN LISTS codegen_blobs_range) - string(STRIP "${range_line}" stripped_line) - separate_arguments(split_line UNIX_COMMAND "${stripped_line}") - list(GET split_line 0 trait_name) - list(APPEND unique_traits "${trait_name}") - endforeach() - list(REMOVE_DUPLICATES unique_traits) + set(intermediate_libs) + list(LENGTH codegen_blobs codegen_blobs_len) - # Build each trait 
separately - foreach(trait IN LISTS unique_traits) - set(trait_files) - foreach(range_line IN LISTS codegen_blobs_range) - string(STRIP "${range_line}" stripped_line) - separate_arguments(split_line UNIX_COMMAND "${stripped_line}") - list(GET split_line 0 name) - if (name STREQUAL trait) - list(GET split_line 1 first) - list(GET split_line 2 last) - math(EXPR total_files "${last} - ${first}") - if (total_files GREATER 0) - foreach(j RANGE ${first} ${last}-1) - list(LENGTH codegen_blobs blobs_len) - if (j LESS blobs_len) - list(GET codegen_blobs ${j} f) - list(APPEND trait_files "${f}") - endif() - endforeach() - endif() - endif() - endforeach() + foreach(blob IN LISTS codegen_blobs_range) + string(STRIP "${blob}" stripped_blob) + separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}") + # Each line is: + list(GET spilit_blob 0 name) + list(GET spilit_blob 1 first) + list(GET spilit_blob 2 last) + math(EXPR total_files "${last} - ${first}") + if(total_files EQUAL 0) + continue() # nothing for this trait + endif() - if (trait_files) - # Create object libraries with chunking - set(chunk_size 3) # adjust as needed for memory vs parallelism - list(LENGTH trait_files num_files) - math(EXPR num_chunks "( ${num_files} + ${chunk_size} - 1 ) / ${chunk_size}") + # Object libraries (chunked) per trait + set(sub_intermediate_libs) + set(chunk_size 3) + math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}") + math(EXPR num_chunks_minus_1 "${num_chunks} - 1") + + foreach(i RANGE 0 ${num_chunks_minus_1}) + math(EXPR start "${first} + ${i} * ${chunk_size} ") + math(EXPR end "${start} + ${chunk_size} - 1") - set(trait_obj_libs) - foreach(i RANGE 0 ${num_chunks}-1) - math(EXPR start "${i} * ${chunk_size}") - math(EXPR end "${start} + ${chunk_size} - 1") - - set(chunk_files) - foreach(j RANGE ${start} ${end}) - if (j LESS ${num_files}) - list(GET trait_files ${j} f) - list(APPEND chunk_files "${f}") - endif() - endforeach() - - if (chunk_files) - 
set(obj_lib_name "gemm_obj_${trait}_${i}_${datatype}_${layout}") - add_library(${obj_lib_name} OBJECT ${chunk_files}) - add_dependencies(${obj_lib_name} gemm_gen_${datatype}_${layout}) - - target_compile_options(${obj_lib_name} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - -O3 - -fno-exceptions - ) - - set_target_properties(${obj_lib_name} PROPERTIES - UNITY_BUILD ON - UNITY_BUILD_BATCH_SIZE 2 - ) - - list(APPEND trait_obj_libs "${obj_lib_name}") + set(chunk_files) + foreach(j RANGE ${start} ${end}) + if(j LESS ${last} AND j LESS ${codegen_blobs_len}) + list(GET codegen_blobs ${j} f) + list(APPEND chunk_files "${f}") endif() endforeach() - # Static library for this trait - if (trait_obj_libs) - set(trait_lib_name "gemm_lib_${trait}_${datatype}_${layout}") - set(obj_exprs) - foreach(objlib IN LISTS trait_obj_libs) - list(APPEND obj_exprs "$") - endforeach() - - add_library(${trait_lib_name} STATIC ${obj_exprs}) - add_dependencies(${trait_lib_name} gemm_gen_${datatype}_${layout}) - - # Trait-specific executable - set(exec_name "benchmark_gemm_${datatype}_${layout}_${trait}") - add_executable(${exec_name} benchmark_gemm.cpp) - target_link_libraries(${exec_name} PRIVATE ${trait_lib_name}) - target_include_directories(${exec_name} PRIVATE - "${CMAKE_CURRENT_LIST_DIR}" - "${working_path}" - ) - target_compile_definitions(${exec_name} PRIVATE - GEMM_TRAIT_FILTER="${trait}" - ) - target_compile_options(${exec_name} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - ) + #list(LENGTH chunk_files chunk_files_len) + #if(chunk_files_len AND chunk_files_len GREATER 1) + if(chunk_files) + set(sub_intermediate_lib_name "gemm_objlib_${name}_${i}_${datatype}_${layout}") + add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files}) + list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name}) endif() - endif() - endforeach() - # Master executable including all traits - set(all_trait_libs) - foreach(trait IN 
LISTS unique_traits) - if (TARGET gemm_lib_${trait}_${datatype}_${layout}) - list(APPEND all_trait_libs "gemm_lib_${trait}_${datatype}_${layout}") - endif() - endforeach() + endforeach() - if (all_trait_libs) - add_executable(benchmark_gemm_${datatype}_${layout} benchmark_gemm.cpp) - target_link_libraries(benchmark_gemm_${datatype}_${layout} PRIVATE ${all_trait_libs}) - target_include_directories(benchmark_gemm_${datatype}_${layout} PRIVATE - "${CMAKE_CURRENT_LIST_DIR}" - "${working_path}" - ) - target_compile_options(benchmark_gemm_${datatype}_${layout} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - ) - endif() + # ------------------ Bundle the object libs into one static lib --------- + #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len) + #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1) + if(sub_intermediate_libs) + set(intermediate_lib_name "gemm_staticlib_${name}_${datatype}_${layout}") + # Collect the $ expressions + + set(obj_exprs) + foreach(objlib IN LISTS sub_intermediate_libs) + list(APPEND obj_exprs $) + endforeach() + + add_library(${intermediate_lib_name} STATIC ${obj_exprs}) + add_dependencies(${intermediate_lib_name} gemm_gen_${datatype}_${layout}) + #foreach(objlib IN LISTS sub_intermediate_libs) + # target_sources(${intermediate_lib_name} PRIVATE $) + #endforeach() + list(APPEND intermediate_libs ${intermediate_lib_name}) + endif() + + endforeach() + + # Interface library for instances + add_library(gemm_template_instances_${datatype}_${layout} INTERFACE) + add_dependencies(gemm_template_instances_${datatype}_${layout} gemm_gen_${datatype}_${layout}) + target_link_libraries(gemm_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs}) + target_include_directories(gemm_template_instances_${datatype}_${layout} INTERFACE + ${CMAKE_CURRENT_LIST_DIR} + "${working_path}" + ) + set_target_properties(gemm_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX) + 
+ # Host API interface library + add_library(gemm_host_api_${datatype}_${layout} INTERFACE) + target_link_libraries(gemm_host_api_${datatype}_${layout} INTERFACE gemm_template_instances_${datatype}_${layout}) + target_include_directories(gemm_host_api_${datatype}_${layout} INTERFACE + ${CMAKE_CURRENT_LIST_DIR} + "${working_path}" + ) + + + # Executable per datatype + set(exec_name "benchmark_gemm_${datatype}_${layout}") + add_executable(${exec_name} benchmark_gemm.cpp) + target_link_libraries(${exec_name} PRIVATE gemm_host_api_${datatype}_${layout}) + target_compile_options(${exec_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + ) endfunction() -# Process each datatype/layout +# Process each datatype in isolation foreach(dt IN LISTS GEMM_DATATYPE) foreach(l IN LISTS GEMM_LAYOUT) - build_gemm_for_datatype("${dt}" "${l}") + build_gemm_for_datatype(${dt} ${l}) endforeach() endforeach() - -# Master target for parallel builds -set(ALL_GEMM_TARGETS) -foreach(dt IN LISTS GEMM_DATATYPE) - foreach(l IN LISTS GEMM_LAYOUT) - list(APPEND ALL_GEMM_TARGETS "benchmark_gemm_${dt}_${l}") - endforeach() -endforeach() -add_custom_target(benchmark_gemm_all DEPENDS ${ALL_GEMM_TARGETS}) - -# Use faster linker if available -find_program(LLD_LINKER "ld.lld") -find_program(MOLD_LINKER "mold") -if (MOLD_LINKER) - message(STATUS "Using mold linker for faster linking") - add_link_options(-fuse-ld=mold) -elseif (LLD_LINKER) - message(STATUS "Using lld linker for faster linking") - add_link_options(-fuse-ld=lld) -endif() \ No newline at end of file From 07469142cb887dd7569aae24cc264f95c8339b0e Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Wed, 6 Aug 2025 00:34:39 -0700 Subject: [PATCH 09/21] delete all slp compilation flag in CK Tile (#2625) --- example/65_gemm_multiply_multiply/CMakeLists.txt | 13 ++++++------- example/67_gemm_microscaling/CMakeLists.txt | 2 +- example/ck_tile/03_gemm/CMakeLists.txt | 2 +- .../gpu/gemm_blockscale_wp/CMakeLists.txt | 16 
++++++++-------- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 9f4c43338e..d1e1a51afd 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -31,7 +31,7 @@ foreach(gpu IN LISTS GPU_TARGETS) example_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) endif() - set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") + set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) @@ -39,22 +39,22 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") +set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") set(BLOCKSCALE_GEMM_OPTIONS ) check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP) check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION) if(hip_VERSION_FLAT LESS 600443483 OR hip_VERSION_FLAT GREATER_EQUAL 700000000) if(HAS_MISCHED_BOTTOMUP) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm 
--schedmodel=0 -mllvm --misched-prera-direction=bottomup") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup") endif() else() if(HAS_MISCHED_BOTTOMUP) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-prera-direction=bottomup") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-prera-direction=bottomup") endif() endif() @@ -62,7 +62,6 @@ check_cxx_compiler_flag("-mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupa if(HAS_MAX_OCCUPANCY_EXPERIMENTAL) list(APPEND BLOCKSCALE_GEMM_OPTIONS -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental) endif() -# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1") example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt index 14b648c9f8..6ee43aac62 100644 --- a/example/67_gemm_microscaling/CMakeLists.txt +++ b/example/67_gemm_microscaling/CMakeLists.txt @@ -58,7 +58,7 @@ example_compile_options(example_moe_gemm1_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_M example_compile_options(example_moe_gemm2_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_MXGEMM_OPTIONS}) set(FP8_MXGEMM_OPTIONS) -list(APPEND FP8_MXGEMM_OPTIONS 
"SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") +list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") example_compile_options(example_gemm_mx_fp8 PRIVATE ${FP8_MXGEMM_OPTIONS}) example_compile_options(example_gemm_mx_bf8 PRIVATE ${FP8_MXGEMM_OPTIONS}) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index e6f67e4c76..b1aede42c7 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -10,7 +10,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef) list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker) list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm -enable-noalias-to-md-conversion=0") +list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0") target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt index c8740e8d8c..0ffe5f95b2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt @@ -10,14 +10,14 @@ list(APPEND GEMM_BLOCKSCALE_WP_INSTANCES check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP) 
check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION) if(HAS_MISCHED_BOTTOMUP) - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS 
";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + 
set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") endif() add_instance_library(device_gemm_blockscale_wp_instance ${GEMM_BLOCKSCALE_WP_INSTANCES}) From 15e8b6ccf7220fa11c7497348e3c877c59e3b013 Mon Sep 17 00:00:00 2001 From: Yi DING Date: Wed, 6 Aug 2025 20:04:23 +0800 Subject: [PATCH 10/21] [CK_TILE] Fix FMHA qr_async causing errors in FA (#2627) --- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 33 ++++++++++++------- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 28 +++++----------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 730641a6b0..269af4e6a7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -533,20 +533,31 @@ class KernelComponentFactory: pipelines = [] if dtype in ['fp16', 'bf16']: for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]): - if bias == "bias": - # TODO: rocm 6.2 compiler problem if using qr_async for bias case + if hdim == 256 and hdim_v == 256: + # if True: 
pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + # the below two is used for hdim vectorize load + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) else: - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - if receipt == 1 and bias != "bias": - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim + if bias == "bias": + # TODO: rocm 6.2 compiler problem if using qr_async for bias case + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, 
squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + else: + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + if receipt == 1 and bias != "bias": + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim elif dtype in ['fp8', 'bf8']: # no need lse/dropout kernels for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): @@ -584,7 +595,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl if pipeline.F_spad != 't' or pipeline.F_skpad != 't': # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not continue - if (hdim, hdim_v) == (192, 128) or hdim == 160: + if (hdim, hdim_v) == (192, 128): # NOTE: this is used to speedup deepseek prefill case, we don't gen training if pipeline.F_bias != 'no' or pipeline.F_dropout == 't': continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 5b35e7f0bd..0e4ac44d45 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ 
b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -41,7 +41,6 @@ K0_MAX_SUBMAX_MAP = { FMHA_FWD_SPLITKV_PIPELINE_MAP = { "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS", "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS", - "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync", } FMHA_FWD_SPLITKV_KERNEL_BODY=""" @@ -685,28 +684,17 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, opt pipelines = [] if dtype in ['fp16', 'bf16']: for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): - # TODO: use async pipeline when compiler is more stable - if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128, 160]: - # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, 
bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - else: - pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - if receipt == 1: - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim - pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) elif dtype in ['fp8', 'bf8']: for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask)) From 2622ff06cb2aabfd94df191083777b4caeb03966 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:16:12 +0200 Subject: [PATCH 11/21] Remove unused lds direct load instruction. 
(#2573) This functionality is replaced by amd_async_buffer_load Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Co-authored-by: Aviral Goel --- .../core/arch/amd_buffer_addressing.hpp | 48 ------------------- include/ck_tile/core/arch/arch.hpp | 16 ------- 2 files changed, 64 deletions(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 29cc3fefe5..35da19cd3e 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ #endif } -template -CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, - const index_t global_offset, - T* lds_base_ptr, - const index_t lds_offset, - const bool is_valid, - const index_t src_element_space_size) -{ - const uint32_t* global_ptr = - reinterpret_cast(reinterpret_cast(global_base_ptr)); - const int32x4_t src_resource = - make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T)); - const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; - -#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); - asm volatile("s_mov_b32 m0, %0; \n\t" - "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), - "v"(global_offset_bytes), - "s"(src_resource) - : "memory"); -#else - // Direct loads require that each thread reads and writes exactly a single DWORD. -#if defined(__gfx9__) - constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; -#endif - // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes). 
- // For gfx950: supports 1, 3, or 4 DWORDs per thread - // For gfx942: supports exactly 1 DWORD per thread -#if defined(__gfx950__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 || - bytes_per_thread == dword_bytes * 4); -#elif defined(__gfx9__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes); -#endif - // LDS pointer must be attributed with the LDS address space. - as3_uint32_ptr lds_ptr = - reinterpret_cast(reinterpret_cast(lds_base_ptr + lds_offset)); - - llvm_amdgcn_raw_buffer_load_lds( - src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0); -#endif -} - #if defined(__gfx950__) template __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 0723026836..96df9d70f7 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -174,22 +174,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier() __builtin_amdgcn_s_barrier(); } -CK_TILE_DEVICE void block_sync_lds_direct_load() -{ -#if 1 - // invoke clang builtins which *should* produce the same result as the inline asm below - // difference: inline asm is being compiled to wait vmcnt(0) after the barrier - s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>(); -#else - // same content as in old CK (#999) - asm volatile("\ - s_waitcnt vmcnt(0) \n \ - s_waitcnt lgkmcnt(0) \n \ - s_barrier \ - " ::); -#endif -} - CK_TILE_DEVICE void s_nop(index_t cnt = 0) { #if 1 From 4750b293fe0abfa44a32181742a48b1dfec468f7 Mon Sep 17 00:00:00 2001 From: Yashvardhan Agarwal Date: Wed, 6 Aug 2025 16:36:59 +0300 Subject: [PATCH 12/21] General 2D Reduction Kernel (#2535) * General 2D Reduction Kernel * Move the reduction kernel from the example * Split the code and add the necessary policy, problem, shape files as per ck_tile convention * Add/modify the headers * Modified the example to work with the 
'new' kernel * Added tests for the kernel * N-D refernce reduce * Added support for N-D input with transform to 2D * Added padding to support various input sized tensors * Bug fix in the thread buffer constructor * Some comments to explain the reduce2d block kernel * comments resolution * clang-format * comments resolution * clang-format * clang-format * comments resolution * clang-format --- example/ck_tile/05_reduce/reduce.cpp | 63 ++- example/ck_tile/05_reduce/reduce.hpp | 164 -------- .../ck_tile/core/container/thread_buffer.hpp | 6 +- .../ck_tile/core/utility/reduce_operator.hpp | 57 ++- .../host/reference/reference_reduce.hpp | 78 ++++ include/ck_tile/ops/reduce.hpp | 5 +- .../ops/reduce/block/block_reduce2d.hpp | 72 +++- .../ops/reduce/kernel/reduce2d_kernel.hpp | 219 +++++++++++ .../reduce2d_default_policy.hpp} | 9 +- .../ops/reduce/pipeline/reduce2d_problem.hpp | 27 ++ .../ops/reduce/pipeline/reduce2d_shape.hpp | 37 ++ test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/reduce/CMakeLists.txt | 7 + test/ck_tile/reduce/test_reduce2d.cpp | 359 ++++++++++++++++++ 14 files changed, 905 insertions(+), 199 deletions(-) delete mode 100644 example/ck_tile/05_reduce/reduce.hpp create mode 100644 include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp rename include/ck_tile/ops/reduce/{block/block_reduce2d_default_policy.hpp => pipeline/reduce2d_default_policy.hpp} (89%) create mode 100644 include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp create mode 100644 include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp create mode 100644 test/ck_tile/reduce/CMakeLists.txt create mode 100644 test/ck_tile/reduce/test_reduce2d.cpp diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp index 602661f779..cf816caa88 100644 --- a/example/ck_tile/05_reduce/reduce.cpp +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -1,16 +1,21 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ #include "ck_tile/host.hpp" -#include "reduce.hpp" +#include "ck_tile/ops/reduce.hpp" #include auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3328", "m dimension") - .insert("n", "4096", "n dimension") + arg_parser.insert("n", "32", "n dimension") + .insert("h", "7", "h dimension") + .insert("w", "7", "w dimension") + .insert("c", "512", "c dimension") .insert("v", "1", "cpu validation or not") .insert("prec", "fp16", "precision") - .insert("warmup", "5", "cold iter") - .insert("repeat", "20", "hot iter"); + .insert("warmup", "0", "cold iter") + .insert("repeat", "1", "hot iter"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -23,15 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = float; using YDataType = DataType; - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - ck_tile::HostTensor x_host({m, n}); - ck_tile::HostTensor y_host_ref({m}); - ck_tile::HostTensor y_host_dev({m}); + std::vector problem_shape = {N, H, W, C}; + std::vector strides(4); + strides[0] = H * W * C; + strides[1] = W * C; + strides[2] = C; + strides[3] = 1; + + // Define reduction specification: + constexpr auto kept_dim = ck_tile::sequence<0, 3>{}; // Which dimension to keep + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce + + ck_tile::HostTensor x_host(problem_shape, strides); + ck_tile::HostTensor y_host_ref({N, C}, {C, 1}); + ck_tile::HostTensor y_host_dev({N, C}, {C, 1}); ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host); @@ -54,7 +72,9 @@ bool run(const 
ck_tile::ArgParser& arg_parser) constexpr ck_tile::index_t kBlockSize = 256; constexpr ck_tile::index_t kBlockPerCu = 1; - ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{})); + ck_tile::index_t kept_dim_len_prod = N * C; + ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) / + BlockTile::at(ck_tile::number<0>{}); std::cout << "grid size " << kGridSize << std::endl; using Shape = ck_tile::Reduce2dShape; @@ -63,6 +83,17 @@ bool run(const ck_tile::ArgParser& arg_parser) using Kernel = ck_tile::Reduce; + // Create input tensor shape and strides + auto input_shape = + ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]); + auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]); + + if(!Kernel::IsSupportedArgument( + C, input_strides)) // output tensor's continuous dimension and input strides + { + throw std::runtime_error("Wrong! Arguments not supported!\n"); + } + float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, ck_tile::make_kernel( Kernel{}, @@ -71,10 +102,12 @@ bool run(const ck_tile::ArgParser& arg_parser) 0, static_cast(x_buf.GetDeviceBuffer()), static_cast(y_buf.GetDeviceBuffer()), - m, - n)); + input_shape, + input_strides, + kept_dim, + reduce_dims)); - std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m; + std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -86,7 +119,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { // reference ck_tile::reference_reduce( - x_host, y_host_ref, ReduceOp{}); + x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims); y_buf.FromDevice(y_host_dev.mData.data()); pass = ck_tile::check_err(y_host_dev, y_host_ref); diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp deleted file mode 100644 index 6fbb0b4274..0000000000 
--- a/example/ck_tile/05_reduce/reduce.hpp +++ /dev/null @@ -1,164 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" -#include "ck_tile/ops/common.hpp" -#include "ck_tile/ops/reduce/block/block_reduce.hpp" -#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" - -namespace ck_tile { - -template - typename BlockTile, // block size, seq - typename WarpTile, // warp size, seq - typename Vector> // contiguous pixels(vector size) along seq -struct Reduce2dShape -{ - static constexpr index_t Block_M = BlockTile::at(number<0>{}); - static constexpr index_t Block_N = BlockTile::at(number<1>{}); - - static constexpr index_t Warp_M = WarpTile::at(number<0>{}); - static constexpr index_t Warp_N = WarpTile::at(number<1>{}); - - static constexpr index_t Vector_M = Vector::at(number<0>{}); - static constexpr index_t Vector_N = Vector::at(number<1>{}); - - static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); - static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); - - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; - - static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); - static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); - - static constexpr index_t BlockSize = - ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); -}; - -template -struct Reduce2dProblem -{ - using XDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - using YDataType = remove_cvref_t; - using BlockShape = remove_cvref_t; - using ReduceOp = ReduceOp_; - - static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; - static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; -}; - -template -struct Reduce -{ - using Problem = 
ck_tile::remove_cvref_t; - using Policy = ck_tile::remove_cvref_t; - - using XDataType = ck_tile::remove_cvref_t; - using ComputeDataType = ck_tile::remove_cvref_t; - using YDataType = ck_tile::remove_cvref_t; - -#if 0 - CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) - const - { - using S = typename Problem::BlockShape; - - const auto x_m_n = make_naive_tensor_view( - p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(M), number<1>{}); - - const auto iM = get_block_id() * S::Block_M; - - auto x_window = make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - Policy::template MakeXBlockTileDistribution()); - - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - - const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; }; - - const XDataType reduce_init_value = 0; - - constexpr auto reduce_dims = sequence<1>{}; - - auto y_compute = decltype(block_tile_reduce( - load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){}; - - set_tile(y_compute, reduce_init_value); - - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x = load_tile(x_window); - block_tile_reduce(y_compute, x, reduce_dims, f_reduce); - move_tile_window(x_window, {0, S::Block_N}); - } - - block_tile_reduce_sync(y_compute, f_reduce); - - store_tile(y_window, cast_tile(y_compute)); - } -#else - CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const - { - using S = typename Problem::BlockShape; - - const auto x_m_n = make_naive_tensor_view( - p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(M), number<1>{}); - - const auto iM = get_block_id() * 
S::Block_M; - - auto x_window = make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - Policy::template MakeXBlockTileDistribution()); - - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - - __shared__ char smem[Policy::template GetSmemSize()]; - - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - - auto reduce_func = typename Problem::ReduceOp{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); - auto block_reduce2d_cross_warp_sync = - Policy::template GetBlockReduce2dCrossWarpSync(); - - using XTensorType = decltype(load_tile(x_window)); - auto y_compute = block_reduce2d.template MakeYBlockTile(); - set_tile(y_compute, reduce_func.template GetIdentityValue()); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x = load_tile(x_window); - block_reduce2d(x, y_compute, reduce_func); - move_tile_window(x_window, {0, S::Block_N}); - } - - block_reduce2d_sync(y_compute, reduce_func); - block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); - - store_tile(y_window, cast_tile(y_compute)); - } -#endif -}; - -} // namespace ck_tile diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp index 77c46e1b8c..d67581e7d2 100644 --- a/include/ck_tile/core/container/thread_buffer.hpp +++ b/include/ck_tile/core/container/thread_buffer.hpp @@ -42,7 +42,11 @@ struct thread_buffer { // TODO: this ctor can't ignore CK_TILE_HOST_DEVICE constexpr thread_buffer() : data{} {} - CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{o} {} + CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{} { + static_for<0, N, 1>{}( + [&](auto i) { data[i] = o; } + ); + } CK_TILE_HOST_DEVICE static constexpr auto size() { return N; } CK_TILE_HOST_DEVICE auto & get() {return data; } diff 
--git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index 8b15d187fe..2d7ac78b06 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -26,7 +26,8 @@ struct Add } template || std::is_same_v>> + typename = std::enable_if_t || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const { float y_ = type_convert(y); @@ -34,6 +35,8 @@ struct Add return type_convert(y_ + x_); } + + static constexpr bool requires_special_combine = false; }; struct SquareAdd @@ -51,13 +54,47 @@ struct SquareAdd { return y + (x * x); } + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const + { + float y_ = type_convert(y); + float x_ = type_convert(x); + return type_convert(y_ + (x_ * x_)); + } + + // For combining partial results + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T combine_partial_results(const T& partial1, + const T& partial2) const + { + return partial1 + partial2; // Just add the partial sums, don't square again + } + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const + { + float partial1_ = type_convert(partial1); + float partial2_ = type_convert(partial2); + return type_convert(partial1_ + partial2_); + } + + static constexpr bool requires_special_combine = true; }; struct Max { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -65,18 +102,24 @@ struct Max template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || 
std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, x); } + + static constexpr bool requires_special_combine = false; }; struct AbsMax { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -84,11 +127,15 @@ struct AbsMax template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, abs(x)); } + + static constexpr bool requires_special_combine = false; }; } // namespace ReduceOp diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp index 8f8aa23670..9952b7b009 100644 --- a/include/ck_tile/host/reference/reference_reduce.hpp +++ b/include/ck_tile/host/reference/reference_reduce.hpp @@ -30,4 +30,82 @@ reference_reduce(const HostTensor& x_m_n, HostTensor& y_m, make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); } + +// Generic reference reduce for arbitrary dimensions +template < + typename XDataType, + typename ComputeDataType, + typename YDataType, + typename ReduceOp, + typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep + typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to + // reduce +CK_TILE_HOST void reference_reduce(const HostTensor& x_tensor, + HostTensor& y_tensor, + ReduceOp reduce_op, + KeptDim kept_dim, + ReduceDims reduce_dims) +{ + const auto& x_lengths = x_tensor.mDesc.get_lengths(); + + // Calculate total kept elements 
(product of all kept dimension lengths) + index_t total_kept_elements = 1; + static_for<0, kept_dim.size(), 1>{}( + [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; }); + + // Calculate total reduce elements (product of all reduce dimension lengths) + index_t total_reduce_elements = 1; + static_for<0, reduce_dims.size(), 1>{}( + [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; }); + + auto f = [&](auto linear_kept_idx) { + ComputeDataType v_acc = reduce_op.template GetIdentityValue(); + + // Convert linear kept index to multi-dimensional kept indices + std::vector kept_indices(kept_dim.size()); + index_t temp_kept = linear_kept_idx; + static_for<0, kept_dim.size(), 1>{}([&](auto i) { + constexpr auto dim_idx = kept_dim.size() - 1 - i; + constexpr auto dim = kept_dim.at(dim_idx); + const auto len = x_lengths[dim]; + kept_indices[dim_idx] = temp_kept % len; + temp_kept /= len; + }); + + for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx) + { + // Convert linear reduce index to multi-dimensional reduce indices + std::vector reduce_indices(reduce_dims.size()); + index_t temp_reduce = reduce_idx; + static_for<0, reduce_dims.size(), 1>{}([&](auto i) { + constexpr auto dim_idx = reduce_dims.size() - 1 - i; + constexpr auto dim = reduce_dims.at(dim_idx); + const auto len = x_lengths[dim]; + reduce_indices[dim_idx] = temp_reduce % len; + temp_reduce /= len; + }); + + // Build full input tensor indices by combining kept and reduce indices + std::vector full_indices(x_lengths.size(), 0); + static_for<0, kept_dim.size(), 1>{}( + [&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; }); + static_for<0, reduce_dims.size(), 1>{}( + [&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; }); + + // Access input tensor element + const auto v_a = type_convert(x_tensor(full_indices)); + + v_acc = reduce_op(v_acc, v_a); + } + + // Calculate output tensor index using kept indices + // The output tensor has 
the same structure as the kept dimensions + std::vector y_indices(kept_dim.size()); + static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; }); + + y_tensor(y_indices) = type_convert(v_acc); + }; + + make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency()); +} } // namespace ck_tile diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp index 80ead84e85..042e0b98c2 100644 --- a/include/ck_tile/ops/reduce.hpp +++ b/include/ck_tile/ops/reduce.hpp @@ -5,8 +5,11 @@ #include "ck_tile/ops/reduce/block/block_reduce.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d.hpp" -#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" +#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp" diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index 62c9944bd2..849fa6c252 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -7,20 +7,55 @@ namespace ck_tile { +// BlockReduce2d implements a hierarchical 2D reduction operator that reduces data along the second +// dimension using a user-specified reduction function. 
+// +// The reduction is performed in a three-stage hierarchical approach: +// +// STAGE 1: Thread-level reduction (BlockReduce2d) +// =============================================== +// - Each thread processes multiple elements from the input tensor within its assigned data +// partition +// - Reduction is performed locally within each thread by iterating over assigned elements +// - ReducePacksPerXDim controls how many elements sweep_tile processes in one iteration per +// dimension +// (e.g., {1,1} = 1 element at a time from each dimension, {2,4} = 2 from dim0, 4 from dim1) +// - Results are accumulated into a thread-local output tensor stored in registers +// - The output tensor distribution is derived from the input tensor's distribution using +// make_reduce_tile_distribution_encoding() to handle dimension reduction +// +// STAGE 2: Warp-level reduction (BlockReduce2dSync) +// ================================================ +// - Performs inter-thread reduction within each warp +// - Uses warp shuffle operations to exchange data between threads in the same warp +// - Implements a tree-reduction pattern with power-of-2 stages +// - Only reduces along dimensions that map to lane IDs within the warp +// +// STAGE 3: Cross-warp reduction (BlockReduce2dCrossWarpSync) +// ======================================================== +// - Performs reduction across multiple warps within the same thread block +// - Uses shared memory (LDS) to facilitate data exchange between warps +// - Each warp's lane-0 thread stores its partial results to shared memory +// - All threads participate in loading and reducing data from shared memory +// - Implements block-level synchronization to ensure memory consistency + +// BlockReduce2d: Thread-level reduction (Stage 1) template struct BlockReduce2d { - // in-thread reduction + // Thread-level reduction implementation using Problem = remove_cvref_t; using XDataType = typename Problem::XDataType; using ComputeDataType = typename 
Problem::ComputeDataType; CK_TILE_DEVICE constexpr BlockReduce2d() {} - template > + template < + typename XDistributedTensor_, + typename YDistributedTensor_, + typename ReduceFunc, + typename ReducePacksPerXDim = + uniform_sequence_gen_t<2, 1>> // {1,1} = process 1 element at a time from each dimension CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, YDistributedTensor_& y_tensor, const ReduceFunc& reduce_func, @@ -33,6 +68,7 @@ struct BlockReduce2d y_tensor(idx_0), ck_tile::type_convert(x_tensor[idx_])...); }, ReducePacksPerXDim{}); + #if 0 constexpr auto I0 = number<0>{}; constexpr auto I1 = number<1>{}; @@ -75,6 +111,8 @@ struct BlockReduce2d return tensor; } + // uniform_sequence_gen_t generates sequence of NSize elements filled with Value + // e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4} template > @@ -91,6 +129,7 @@ struct BlockReduce2d } }; +// BlockReduce2dSync: Warp-level reduction (Stage 2) template struct BlockReduce2dSync { @@ -145,8 +184,15 @@ struct BlockReduce2dSync // pull data from remote lane const auto v_remote = warp_shuffle(v_local, src_lane); - // reduce - v_local = reduce_func(v_local, v_remote); + // For reduce, use combine_partial_results for operations that require it + if constexpr(ReduceFunc::requires_special_combine) + { + v_local = reduce_func.combine_partial_results(v_local, v_remote); + } + else + { + v_local = reduce_func(v_local, v_remote); + } }); } }); @@ -157,6 +203,7 @@ struct BlockReduce2dSync } }; +// BlockReduce2dCrossWarpSync: Cross-warp reduction (Stage 3) template struct BlockReduce2dCrossWarpSync { @@ -263,8 +310,15 @@ struct BlockReduce2dCrossWarpSync constexpr auto i_1 = number{}; const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; - // reduce - v_local = reduce_func(v_local, v_remote); + // For reduce, use combine_partial_results for operations that require it + if constexpr(ReduceFunc::requires_special_combine) + { + v_local = 
reduce_func.combine_partial_results(v_local, v_remote); + } + else + { + v_local = reduce_func(v_local, v_remote); + } }); y_tensor.get_thread_buffer()(i_0) = v_local; diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp new file mode 100644 index 0000000000..f65487ea6e --- /dev/null +++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/reduce/block/block_reduce.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp" + +// Reduce2d Kernel: +// ======================================= +// This kernel implements a 2D reduction operation that reduces data along the second dimension +// of a matrix. The reduction is performed in multiple hierarchical stages. + +namespace ck_tile { + +template +struct Reduce +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + + private: + // Helper function to calculate optimal vector size for input tensor + template + static constexpr index_t CalculateInputVectorSize() + { + using S = typename Problem::BlockShape; + constexpr index_t memory_vector_size = 16 / sizeof(XDataType); + constexpr index_t thread_tile_vector_size = S::ThreadTile_N; + + // Check if innermost reduce dimension is the last dimension (stride 1). 
+ constexpr auto innermost_reduce_dim = ReduceDims{}.at(number{}); + constexpr bool is_innermost_contiguous = (innermost_reduce_dim == InputShape{}.size() - 1); + + // If innermost reduce dimension is not the last dim (not contiguous), limit vectorization + constexpr index_t stride_based_vector_size = + is_innermost_contiguous ? ck_tile::min(memory_vector_size, thread_tile_vector_size) : 1; + + return stride_based_vector_size; + } + + // Helper function to calculate optimal vector size for output tensor + static constexpr index_t CalculateOutputVectorSize() + { + using S = typename Problem::BlockShape; + constexpr index_t memory_vector_size = 16 / sizeof(YDataType); + constexpr index_t thread_tile_vector_size = S::ThreadTile_M; + constexpr index_t vector_size = ck_tile::min(memory_vector_size, thread_tile_vector_size); + + return vector_size; + } + + public: + template + CK_TILE_DEVICE void operator()(const XDataType* p_x, + YDataType* p_y, + InputShape input_shape, + InputStrides input_strides, + KeptDim kept_dim, + ReduceDims reduce_dims) const + { + using S = typename Problem::BlockShape; + const auto iM = get_block_id() * S::Block_M; + + static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(), + "Size of kept dimensions + reduced dimensions must equal input tensor rank"); + + // Extract lengths based on kept and reduced dimensions + const auto kept_lens = [&]() { + return generate_tuple([&](auto I) { return input_shape.at(number{}); }, + number{}); + }(); + const auto reduce_lens = [&]() { + return generate_tuple( + [&](auto I) { return input_shape.at(number{}); }, + number{}); + }(); + + const auto kept_merge_transform = make_merge_transform(kept_lens); + const auto reduce_merge_transform = make_merge_transform(reduce_lens); + + auto reduce_func = typename Problem::ReduceOp{}; + const XDataType custom_padding_value = + type_convert(reduce_func.template GetIdentityValue()); + + // Calculate optimal vector size for input tensor + constexpr auto 
x_tensor_vector_size = CalculateInputVectorSize(); + + // Create input tensor view with custom padding value + auto desc = make_naive_tensor_descriptor( + input_shape, input_strides, number{}, number<1>{}); + + // Create buffer view with custom padding value + auto buffer_view = make_buffer_view( + p_x, desc.get_element_space_size(), custom_padding_value); + + // Create tensor view with custom padding + const auto x_tensor = tensor_view{buffer_view, desc}; + const auto transformed_x_tensor = pad_tensor_view( + transform_tensor_view(x_tensor, + make_tuple(kept_merge_transform, reduce_merge_transform), + make_tuple(kept_dim, reduce_dims), + make_tuple(sequence<0>{}, sequence<1>{})), + make_tuple(number{}, number{}), + sequence<0, 1>{}); + + // Calculate strides for output tensor based on its own dimensions + const auto kept_strides = [&]() { + return generate_tuple( + [&](auto I) { + // Calculate stride for dimension I as product of all following dimensions + index_t stride = 1; + static_for{}( + [&](auto J) { stride *= kept_lens.at(number{}); }); + return stride; + }, + number{}); + }(); + + // Calculate optimal vector size for output tensor + constexpr auto y_tensor_vector_size = CalculateOutputVectorSize(); + + const auto y_m = make_naive_tensor_view( + p_y, kept_lens, kept_strides, number{}, number<1>{}); + + // Transform output tensor to 1D merged view + // This creates a view compatible with the 2D reduction pattern + const auto y_merged = transform_tensor_view( + y_m, + make_tuple(kept_merge_transform), + make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}), + make_tuple(sequence<0>{})); + + auto x_window = make_tile_window(transformed_x_tensor, + make_tuple(number{}, number{}), + {iM, 0}, + Policy::template MakeXBlockTileDistribution()); + + auto y_window = make_tile_window(y_merged, make_tuple(number{}), {iM}); + + __shared__ char smem[Policy::template GetSmemSize()]; + + // Get the merged dimension size from the transformed tensor + 
const auto merged_reduce_len = + transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{}); + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N)); + + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + using XTensorType = decltype(load_tile(x_window)); + auto y_compute = block_reduce2d.template MakeYBlockTile(); + set_tile(y_compute, reduce_func.template GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + block_reduce2d(x, y_compute, reduce_func); + move_tile_window(x_window, {0, S::Block_N}); + } + + block_reduce2d_sync(y_compute, reduce_func); + block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); + + store_tile(y_window, cast_tile(y_compute)); + } + + /// @brief Validates if the given arguments are supported by the 2D reduction kernel. + /// + /// @param y_continous_dim Size of the continuous dimension of the output tensor. + /// Must be a multiple of ThreadTile_N for proper thread mapping. + /// + /// @param input_strides The stride configuration of the input tensor. + /// The last stride must be 1 to ensure contiguous memory access + /// and enable efficient vectorized loads. + /// + /// @return true if the arguments are supported, false otherwise. + /// Error messages are logged when CK_TILE_LOGGING is enabled. 
+ /// + /// @note Requirements: + /// - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution) + /// - input_strides[-1] == 1 (for contiguous memory access) + CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim, auto input_strides) + { + using S = typename Problem::BlockShape; + + if(y_continous_dim % S::ThreadTile_N != 0) + { + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!"); + } + return false; + } + + if(input_strides.at(number{}) != 1) + { + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR( + "Input tensor's last stride must be 1 to support correct vector access!"); + } + return false; + } + + return true; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp similarity index 89% rename from include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp rename to include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp index 3c547242d5..27bb4bcdcb 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -9,7 +9,7 @@ namespace ck_tile { -struct BlockReduce2dDefaultPolicy +struct Reduce2dDefaultPolicy { template CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() @@ -18,8 +18,9 @@ struct BlockReduce2dDefaultPolicy return make_static_tile_distribution( tile_distribution_encoding< sequence<>, - tuple, - sequence>, + tuple< + sequence, + sequence>, tuple, sequence<1, 2>>, tuple, sequence<2, 2>>, sequence<1, 1, 2, 2>, diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp new file mode 100644 index 0000000000..67fdec9286 --- /dev/null +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct Reduce2dProblem +{ + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + using ReduceOp = ReduceOp_; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp new file mode 100644 index 0000000000..31eb1f2f4f --- /dev/null +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template + typename BlockTile, // block size, seq + typename WarpTile, // warp size, seq + typename ThreadTile> // contiguous pixels(vector size) along seq +struct Reduce2dShape +{ + static constexpr index_t Block_M = BlockTile::at(number<0>{}); + static constexpr index_t Block_N = BlockTile::at(number<1>{}); + + static constexpr index_t Warp_M = WarpTile::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile::at(number<1>{}); + + static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{}); + static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{}); + + static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); + + static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N; + + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + static constexpr index_t BlockSize = + ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); +}; +} // namespace ck_tile diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 42605f2513..9a1df56208 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -21,3 +21,4 @@ add_subdirectory(add_rmsnorm2d_rdquant) # add_subdirectory(layernorm2d) # add_subdirectory(rmsnorm2d) add_subdirectory(gemm_block_scale) +add_subdirectory(reduce) \ No newline at end of file diff --git a/test/ck_tile/reduce/CMakeLists.txt b/test/ck_tile/reduce/CMakeLists.txt new file mode 100644 index 0000000000..052669e20a --- /dev/null +++ b/test/ck_tile/reduce/CMakeLists.txt @@ -0,0 +1,7 @@ +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp) + if(result EQUAL 0) + 
target_link_libraries(test_ck_tile_reduce2d PRIVATE utility) + endif() +endif() + diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp new file mode 100644 index 0000000000..4ce0b56ef3 --- /dev/null +++ b/test/ck_tile/reduce/test_reduce2d.cpp @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/reduce.hpp" +#include "ck_tile/host/kernel_launch.hpp" + +template +class TestCkTileReduce : public ::testing::Test +{ + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using ComputeDataType = std::tuple_element_t<1, Tuple>; + using YDataType = std::tuple_element_t<2, Tuple>; + using ReduceOpType = std::tuple_element_t<3, Tuple>; + using BlockWarps_ = std::tuple_element_t<4, Tuple>; + using BlockTile_ = std::tuple_element_t<5, Tuple>; + using WarpTile_ = std::tuple_element_t<6, Tuple>; + using ThreadTile_ = std::tuple_element_t<7, Tuple>; + + using TestReduce2dShape = + ck_tile::Reduce2dShape; + + template + void RunGenericTest(const std::vector& input_shape, + const std::vector& input_strides, + const std::vector& output_shape, + const std::vector& output_strides, + ck_tile::index_t kept_dim_len_prod, + ck_tile::index_t total_reduce_elements, + KeptDimSeq kept_dims, + ReduceDimSeq reduce_dims) + { + ck_tile::HostTensor h_x(input_shape, input_strides); + ck_tile::HostTensor h_y(output_shape, output_strides); + ck_tile::HostTensor h_y_ref(output_shape, output_strides); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_x); + h_y.SetZero(); + h_y_ref.SetZero(); + + ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_y_mem(h_y.get_element_space_size_in_bytes()); + + d_x_mem.ToDevice(h_x.data()); + d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer + + // Problem and 
kernel setup + using Problem = ck_tile:: + Reduce2dProblem; + + using Kernel = ck_tile::Reduce; + + // Launch configuration + constexpr ck_tile::index_t kBlockSize = 256; + constexpr ck_tile::index_t kBlockPerCu = 1; + + ck_tile::index_t kGridSize = + (kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M; + + // Generic helper to create tuple from vector based on compile-time size + auto make_shape_tuple = [](const std::vector& vec) { + return [&vec](std::index_sequence) { + return ck_tile::make_tuple(vec[I]...); + }(std::make_index_sequence{}); + }; + + auto input_shape_tuple = make_shape_tuple.template operator()(input_shape); + auto input_strides_tuple = make_shape_tuple.template operator()(input_strides); + + if(!Kernel::IsSupportedArgument( + output_shape[output_shape.size() - 1], + input_strides_tuple)) // output tensor's continuous dimension + { + throw std::runtime_error("Wrong! Arguments not supported!\n"); + } + + ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0}, + ck_tile::make_kernel( + Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(d_x_mem.GetDeviceBuffer()), + static_cast(d_y_mem.GetDeviceBuffer()), + input_shape_tuple, + input_strides_tuple, + kept_dims, + reduce_dims)); + + // Get results back + d_y_mem.FromDevice(h_y.data()); + + // Reference computation + ck_tile::reference_reduce( + h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims); + + // Calculate proper error thresholds based on data types and number of accumulations + const auto rtol = ck_tile::get_relative_threshold( + total_reduce_elements); + const auto atol = ck_tile::get_absolute_threshold( + 5.0f, total_reduce_elements); + + bool result = + ck_tile::check_err(h_y, h_y_ref, "Error: Incorrect reduce results!", rtol, atol); + EXPECT_TRUE(result); + } + + // Convenience functions for specific dimensional patterns + void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1) + { + constexpr auto kept_dims = 
ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1}; + std::vector input_strides = {dim1, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; + ck_tile::index_t total_reduce_elements = dim1; + + RunGenericTest<2>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0, + ck_tile::index_t dim1, + ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; // product of kept dimensions + ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0, + ck_tile::index_t dim1, + ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0, 1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {dim0, dim1}; + std::vector output_strides = {dim1, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0 * dim1; //
product of kept dimensions + ck_tile::index_t total_reduce_elements = dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N, + ck_tile::index_t C, + ck_tile::index_t H, + ck_tile::index_t W) + { + constexpr auto kept_dims = ck_tile::sequence<0, 1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2, 3>{}; + + // Input shape and strides + std::vector input_shape = {N, C, H, W}; + std::vector input_strides = {C * H * W, H * W, W, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N, + ck_tile::index_t H, + ck_tile::index_t W, + ck_tile::index_t C) + { + constexpr auto kept_dims = ck_tile::sequence<0, 3>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {N, H, W, C}; + std::vector input_strides = {H * W * C, W * C, C, 1}; + + // Output shape and strides (keep dim0, dim3) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } +}; + +// Shape
parameters for different test configurations +using Shape1_BlockWarps = ck_tile::sequence<4, 1>; +using Shape1_BlockTile = ck_tile::sequence<128, 128>; +using Shape1_WarpTile = ck_tile::sequence<32, 128>; +using Shape1_ThreadTile = ck_tile::sequence<8, 8>; + +using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test +using Shape2_BlockTile = ck_tile::sequence<2, 1024>; +using Shape2_WarpTile = ck_tile::sequence<1, 512>; +using Shape2_ThreadTile = ck_tile::sequence<1, 8>; + +// Test configurations for different data types and operations +using TestConfig_F32_Add = std::tuple; + +using TestConfig_F16_Add = std::tuple; + +using TestConfig_F32_CrossWarp = std::tuple; + +using TestConfig_F32_Max = std::tuple; + +using TestConfig_F32_SquareAdd = std::tuple; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(TestCkTileReduce, TestTypes); + +// 2D Tests - Keep dim0, reduce dim1 +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32) +{ + this->RunTest2D_KeepDim0_ReduceDim1(64, 32); +} + +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512) +{ + this->RunTest2D_KeepDim0_ReduceDim1(1024, 512); +} + +// 3D Tests - Keep dim0, reduce dim1,2 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1) +{ + this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8); +} +// 3D Tests - Keep dim0,1, reduce dim2 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16) +{ + this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16); +} + +// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16) +{ + this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16); +} +// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128) +{ + this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128); +} From 1824d65758beeb6af10c02a2c35f959414348bc9 Mon Sep 17 00:00:00 2001 From: Max Podkorytov
<4273004+tenpercent@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:15:44 -0700 Subject: [PATCH 13/21] modernize scripts for running cmake and clang-format (#2503) Co-authored-by: Aviral Goel --- script/clang-format-overwrite.sh | 5 +++++ script/cmake-ck-dev.sh | 3 +++ script/cmake-ck-release.sh | 3 +++ 3 files changed, 11 insertions(+) diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index a770970fef..ea2834ae62 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,7 @@ +#!/bin/bash +set -euo pipefail +IFS=$'\n\t' + + find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}' git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}' diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index c45bb4330d..25a1590808 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -1,4 +1,7 @@ #!/bin/bash +set -euo pipefail +IFS=$'\n\t' + rm -f CMakeCache.txt rm -f *.cmake rm -rf CMakeFiles diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh index 311ea91822..5263de92c8 100755 --- a/script/cmake-ck-release.sh +++ b/script/cmake-ck-release.sh @@ -1,4 +1,7 @@ #!/bin/bash +set -euo pipefail +IFS=$'\n\t' + rm -f CMakeCache.txt rm -f *.cmake rm -rf CMakeFiles From 5328b232b25cdf0989ba9ec5dbbda99e4933587c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 7 Aug 2025 08:36:47 +0200 Subject: [PATCH 14/21] Grouped Convolution Forward Infer Bias Bnorm Activ (#2621) * Grouped Convolution Forward Infer Bias Bnorm Activ * 3d --- .../gpu/element/element_wise_operation.hpp | 52 ++ .../device_operation_instance_factory.hpp 
| 47 +- ...ice_grouped_conv_fwd_xdl_comp_instance.hpp | 7 +- .../device_grouped_conv_fwd_xdl_instance.hpp | 7 +- ...ped_conv_fwd_xdl_large_tensor_instance.hpp | 7 +- ...vice_grouped_conv_fwd_xdl_mem_instance.hpp | 7 +- ...ed_conv_fwd_xdl_merged_groups_instance.hpp | 7 +- ...d_convolution_forward_bias_bnorm_clamp.hpp | 237 ++++++ ...nvolution_forward_bias_bnorm_clamp_xdl.inc | 776 ++++++++++++++++++ .../CMakeLists.txt | 240 ++++++ ...nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in | 67 ++ ...dl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in | 63 ++ ...gc_gkyxc_nhwgk_bf16_comp_part2_instance.in | 67 ++ ..._nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in | 67 ++ ...xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in | 63 ++ ...wgc_gkyxc_nhwgk_f16_comp_part2_instance.in | 67 ++ ...xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in | 62 ++ ...l_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in | 63 ++ ...amp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in | 60 ++ ...dl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in | 62 ++ ...lamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in | 60 ++ ...dl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in | 62 ++ ...lamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in | 60 ++ ..._tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in | 43 + ...e_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in | 43 + ...e_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in | 43 + ...wgc_gkyxc_nhwgk_bf16_mem_inter_instance.in | 63 ++ ...wgc_gkyxc_nhwgk_bf16_mem_intra_instance.in | 63 ++ ...hwgc_gkyxc_nhwgk_f16_mem_inter_instance.in | 63 ++ ...hwgc_gkyxc_nhwgk_f16_mem_intra_instance.in | 63 ++ ...hwgc_gkyxc_nhwgk_f32_mem_inter_instance.in | 63 ++ ...hwgc_gkyxc_nhwgk_f32_mem_intra_instance.in | 63 ++ ..._groups_nhwgc_gkyxc_nhwgk_bf16_instance.in | 79 ++ ...d_groups_nhwgc_gkyxc_nhwgk_f16_instance.in | 79 ++ ...d_groups_nhwgc_gkyxc_nhwgk_f32_instance.in | 53 ++ .../CMakeLists.txt | 240 ++++++ ...wgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in | 67 ++ ...ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in | 63 ++ ..._gkzyxc_ndhwgk_bf16_comp_part2_instance.in | 67 ++ 
...hwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in | 67 ++ ..._ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in | 63 ++ ...c_gkzyxc_ndhwgk_f16_comp_part2_instance.in | 67 ++ ..._ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in | 62 ++ ...dhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in | 63 ++ ..._xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in | 60 ++ ...ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in | 62 ++ ...p_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in | 60 ++ ...ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in | 62 ++ ...p_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in | 60 ++ ...nsor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in | 43 + ...ensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in | 43 + ...ensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in | 43 + ...c_gkzyxc_ndhwgk_bf16_mem_inter_instance.in | 63 ++ ...c_gkzyxc_ndhwgk_bf16_mem_intra_instance.in | 63 ++ ...gc_gkzyxc_ndhwgk_f16_mem_inter_instance.in | 63 ++ ...gc_gkzyxc_ndhwgk_f16_mem_intra_instance.in | 63 ++ ...gc_gkzyxc_ndhwgk_f32_mem_inter_instance.in | 63 ++ ...gc_gkzyxc_ndhwgk_f32_mem_intra_instance.in | 63 ++ ...oups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in | 79 ++ ...roups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in | 79 ++ ...roups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in | 53 ++ ...grouped_conv_fwd_bias_bnorm_clamp_impl.hpp | 427 ++++++++++ .../CMakeLists.txt | 6 + ...st_grouped_convnd_fwd_bias_bnorm_clamp.cpp | 97 +++ ...grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp | 98 +++ 65 files changed, 5299 insertions(+), 38 deletions(-) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in 
create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in 
create mode 100644 profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp create mode 100644 test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp create mode 100644 test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index b57ae22172..089d4c2a9d 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -562,6 +562,58 @@ struct NormalizeInInfer double epsilon_; }; +// used by Conv+Bias+BatchNorm+Clamp inference +struct BiasNormalizeInInferClamp +{ + BiasNormalizeInInferClamp(float floor = 0.f, + float ceil = NumericLimits::Max(), + float epsilon = 1e-4) + : clamp_(floor, ceil), epsilon_(epsilon) + { + } + + template + __host__ __device__ constexpr void operator()(T& y, + const T& x, + const T& bias, + const T& mean, + const T& variance, + const T& gamma, + const T& beta) const + { + using ck::type_convert; + using ck::math::sqrt; + + float tmp_x = type_convert(x) + type_convert(bias); + + float tmp_y = + ((tmp_x - type_convert(mean)) / sqrt(type_convert(variance) + epsilon_)) * + type_convert(gamma) + + type_convert(beta); + clamp_(tmp_y, tmp_y); + y = type_convert(tmp_y); + }; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x, + const float& bias, + const float& mean, + const float& variance, + const float& gamma, + const float& beta) const + { + using ck::type_convert; + using ck::math::sqrt; + + float tmp_y = (((x + bias) - mean) / sqrt(variance + epsilon_)) * gamma + beta; + clamp_(y, tmp_y); + }; + + Clamp clamp_; + float epsilon_; +}; + template struct UnaryTypeConvert; diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp 
b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index f6983810be..bf7f1b4fa4 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -113,29 +113,30 @@ using GK_Tuple = ck::Tuple; using GK_GK_Tuple = ck::Tuple; // pointwise functor -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using Relu = ck::tensor_operation::element_wise::Relu; -using TanH = ck::tensor_operation::element_wise::TanH; -using Scale = ck::tensor_operation::element_wise::Scale; -using Bilinear = ck::tensor_operation::element_wise::Bilinear; -using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; -using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; -using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; -using AddRelu = ck::tensor_operation::element_wise::AddRelu; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; -using AddSilu = ck::tensor_operation::element_wise::AddSilu; -using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; -using FastGelu = ck::tensor_operation::element_wise::FastGelu; -using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; -using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; -using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; -using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; -using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; -using Gelu = ck::tensor_operation::element_wise::Gelu; -using Swish = ck::tensor_operation::element_wise::Swish; -using Add = ck::tensor_operation::element_wise::Add; -using Multiply = ck::tensor_operation::element_wise::Multiply; +using PassThrough = 
ck::tensor_operation::element_wise::PassThrough; +using Relu = ck::tensor_operation::element_wise::Relu; +using TanH = ck::tensor_operation::element_wise::TanH; +using Scale = ck::tensor_operation::element_wise::Scale; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; +using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; +using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; +using AddSilu = ck::tensor_operation::element_wise::AddSilu; +using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; +using FastGelu = ck::tensor_operation::element_wise::FastGelu; +using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; +using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; +using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; +using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; +using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; +using Gelu = ck::tensor_operation::element_wise::Gelu; +using Swish = ck::tensor_operation::element_wise::Swish; +using Add = ck::tensor_operation::element_wise::Add; +using Multiply = ck::tensor_operation::element_wise::Multiply; template using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index fca236d03e..bbc2a54c34 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index c641019b70..768fcbada0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using 
Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp index 3e98852d58..5a4a011512 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp @@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp index 4e6b9c3d1d..57bdeddcf9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp @@ -32,9 +32,10 @@ using 
Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp index 7ef78d46e2..d07d82e7ee 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp @@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp new file mode 100644 index 0000000000..22cb7854a9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +#ifdef CK_USE_XDL +#include "grouped_convolution_forward_bias_bnorm_clamp_xdl.inc" +#endif + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleABD< + NumDimSpatial, + InLayout, + WeiLayout, + DLayouts, + OutLayout, + InDataType, + WeiDataType, + DDataTypes, + OutDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BiasNormalizeInInferClamp, + AComputeType, + BComputeType>; + + static auto GetInstances() + { + std::vector> op_ptrs; + +#ifdef CK_USE_XDL + // layout NHWGC/GKYXC/NHWGK + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + op_ptrs); 
+ add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances( 
+ op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances( + op_ptrs); + } +#endif + } + // layout NDHWGC/GKZYXC/NDHWGK + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + 
add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances( + op_ptrs); + } +#endif + } +#endif // CK_USE_XDL + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff 
--git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc new file mode 100644 index 0000000000..b11b428471 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc @@ -0,0 +1,776 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_BF16 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void 
add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void 
add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +#ifdef CK_ENABLE_FP16 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void 
add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void 
add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +#ifdef CK_ENABLE_FP32 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + std::vector< + 
std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, 
+ BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt new file mode 100644 index 0000000000..c06e4f5953 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt @@ -0,0 +1,240 @@ +# ONLY XDL_KERNELS +set(GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP) +include(ShardInstantiation) + + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR 
${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + # large tensor + # NHWGC, GKYXC, NHWGK + 
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 2 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + # merged groups + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances + 
TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + #mem + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR 
${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + #comp + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in + NUM_SHARDS 11 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in + NUM_SHARDS 1 + SRC_LIST 
GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in + NUM_SHARDS 5 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances + TEMPLATE_FILE 
xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in + NUM_SHARDS 12 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +add_instance_library(device_grouped_conv2d_fwd_bias_bnorm_clamp_instance ${GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in new file mode 100644 index 0000000000..51a12c33bd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in new file mode 100644 index 0000000000..22ee546ac8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, 
ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in new file mode 100644 index 0000000000..632fee85a8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in new file mode 100644 index 0000000000..50bbf761f1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + 
device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in new file mode 100644 index 0000000000..89baaff411 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in new file mode 100644 index 0000000000..80a2655de6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + 
ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in new file mode 100644 index 0000000000..395885d03d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in new file mode 100644 index 0000000000..097254dc34 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + 
device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..7844440dd0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, 
+ Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in new file mode 100644 index 0000000000..9db1750e8e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..341fdf6eb6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + 
ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in new file mode 100644 index 0000000000..bcb126392a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + 
add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..4e3a435e74 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..0956d9dd71 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..b836dd8374 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..6b8cbf1704 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in new file mode 100644 index 0000000000..a2c36ee52b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + 
NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in new file mode 100644 index 0000000000..1c12ae66a3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in new file mode 100644 index 0000000000..4fde5e662c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + 
device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in new file mode 100644 index 0000000000..d75c7f70d5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in new file mode 100644 index 0000000000..d51b3d01e3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + 
device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in new file mode 100644 index 0000000000..47135a2dd7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..3e08e9668f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + 
ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..ec76a8e1d1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + 
ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..2bbac89bbe --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, 
BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt new file mode 100644 index 0000000000..bda9149227 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt @@ -0,0 +1,240 @@ +# ONLY XDL_KERNELS +set(GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP) +include(ShardInstantiation) + + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 16 + SRC_LIST 
GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + # large tensor + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE 
xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 2 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + # merged groups + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP 
+ OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + #mem + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances + 
TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + #comp + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in + NUM_SHARDS 11 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + 
INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in + NUM_SHARDS 5 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in + NUM_SHARDS 12 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +add_instance_library(device_grouped_conv3d_fwd_bias_bnorm_clamp_instance ${GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in new file mode 100644 index 0000000000..f397f0a810 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, +
ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in new file mode 100644 index 0000000000..d6aa4ea964 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in new file mode 100644 index 0000000000..7c993f8b94 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple,
BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in new file mode 100644 index 0000000000..fb41ec60f8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in new file mode 100644 index 0000000000..e1d581e4fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, +
NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in new file mode 100644 index 0000000000..99b48d51a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in new file mode 100644 index 0000000000..b172975635 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, +
ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in new file mode 100644 index 0000000000..8ec8d9248f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..fb5c4159fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, 
Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in new file mode 100644 index 0000000000..a00fbf5342 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + 
ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..222ec0c2e0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in new file mode 100644 index 0000000000..8fbedb7793 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + 
device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..c538d50fc9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, 
BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..be76a48480 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..dcfdb984c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro 
Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..ed1988cdf4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in new file mode 100644 index 0000000000..83af7e09ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in new file mode 100644 index 0000000000..ce83cb566a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + 
NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in new file mode 100644 index 0000000000..051aaf7cf3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in new file mode 100644 index 0000000000..6fa3709cc6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, 
ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in new file mode 100644 index 0000000000..2ba3e4ec93 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in 
b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in new file mode 100644 index 0000000000..c4d33236af --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, 
ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..6a902ed72d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, 
BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..b8125423bc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, 
ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..f292d95cda --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp new file mode 100644 index 0000000000..43bab919b4 --- /dev/null +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp" + +namespace ck { +namespace profiler { + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; +using Add = ck::tensor_operation::element_wise::Add; + +// NOTE: Usage of NHWGK layout for GK bias is a workaround. This test is to +// just keep such implementation valid. +// TODO: Add possiblity to pass GK layout and GK lengths for bias and reuse +// the same instances. 
+ +template +auto get_elementwise_desc(ck::index_t G, ck::index_t K) +{ + if constexpr(NDimSpatial == 1) + { + return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0}); + } + else if constexpr(NDimSpatial == 2) + { + return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0}); + } + else + { + return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0}); + } +} + +template +void ref_bnorm_clamp_infer(Tensor& out, + Tensor& in, + Tensor& mean, + Tensor& variance, + Tensor& scale, + Tensor& shift, + const float floor, + const float ceil, + const float epsilon) +{ + + auto func = [&](auto... idxs) { + const float x = type_convert(in(idxs...)); + + const float invVariance = + type_convert(1.0f) / std::sqrt(epsilon + type_convert(variance(idxs...))); + + const float norm_x = (x - type_convert(mean(idxs...))) * invVariance; + + float y = + type_convert(scale(idxs...)) * norm_x + type_convert(shift(idxs...)); + + Clamp{floor, ceil}(y, y); + + out(idxs...) = type_convert(y); + }; + if constexpr(NDimSpatial == 1) + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3])(std::thread::hardware_concurrency()); + } + else if constexpr(NDimSpatial == 2) + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3], + out.GetLengths()[4])(std::thread::hardware_concurrency()); + } + else + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3], + out.GetLengths()[4], + out.GetLengths()[5])(std::thread::hardware_concurrency()); + } +} + +template +bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + const float floor = 0.f; + const float ceil = 2048.f; + const float epsilon = 1e-4; + + const auto in_element_op = InElementOp{}; + const auto 
wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{floor, ceil, epsilon}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + const index_t G = conv_param.G_; + const index_t K = conv_param.K_; + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array d_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); + const auto elementwise_desc = + ElementwiseGK ? 
get_elementwise_desc(G, K) : out_g_n_k_wos_desc; + + Tensor bias(elementwise_desc); + Tensor mean(elementwise_desc); + Tensor variance(elementwise_desc); + Tensor scale(elementwise_desc); + Tensor shift(elementwise_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "mean: " << mean.mDesc << std::endl; + std::cout << "variance: " << variance.mDesc << std::endl; + std::cout << "scale: " << scale.mDesc << std::endl; + std::cout << "shift: " << shift.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + mean.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + variance.GenerateTensorValue(GeneratorTensor_2{0, 5}); + scale.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + shift.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + mean.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + variance.GenerateTensorValue(GeneratorTensor_3{0, 0.5}); + scale.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + shift.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + const std::size_t elementwise_dev_buf_size = + ElementwiseGK ? 
sizeof(OutDataType) * G * K + : sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize(); + DeviceMem bias_device_buf(elementwise_dev_buf_size); + DeviceMem mean_device_buf(elementwise_dev_buf_size); + DeviceMem variance_device_buf(elementwise_dev_buf_size); + DeviceMem scale_device_buf(elementwise_dev_buf_size); + DeviceMem shift_device_buf(elementwise_dev_buf_size); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); + + bias_device_buf.ToDevice(bias.mData.data()); + mean_device_buf.ToDevice(mean.mData.data()); + variance_device_buf.ToDevice(variance.mData.data()); + scale_device_buf.ToDevice(scale.mData.data()); + shift_device_buf.ToDevice(shift.mData.data()); + + if constexpr(ElementwiseGK) + { + constexpr ck::index_t spatial_offset = 3; + d_g_n_k_wos_strides[1] = 0; + for(int i = 0; i < NDimSpatial; i++) + { + d_g_n_k_wos_strides[i + spatial_offset] = 0; + } + } + + // run reference op + if(do_verification) + { + // Run Conv and Bnorm seperatly + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd{}; + + std::array, 1> d_tensors = {bias}; + auto ref_conv_invoker = ref_conv.MakeInvoker(); + auto ref_conv_argument = ref_conv.MakeArgument(input, + weight, + host_output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + Add{}, + {}, + {}, + d_tensors); + + // init host output to zero + host_output.SetZero(); + ref_conv_invoker.Run(ref_conv_argument); + ref_bnorm_clamp_infer( + host_output, host_output, mean, variance, scale, shift, floor, ceil, epsilon); + } + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + bool pass = true; + + auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { + // workspace_sz will be equal to 0 for other layout than NGCHW + const std::size_t workspace_sz = 
op_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init output to zero before profiling next kernel + out_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(device_output.mData.data()); + + pass = pass & ck::utils::check_err(device_output, host_output); + + if(do_log) + { + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; + LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + }; + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = 
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer(), + mean_device_buf.GetDeviceBuffer(), + variance_device_buf.GetDeviceBuffer(), + scale_device_buf.GetDeviceBuffer(), + shift_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths}, + {d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + run_impl(op_ptr, argument_ptr); + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/test/grouped_convnd_fwd_activation/CMakeLists.txt b/test/grouped_convnd_fwd_activation/CMakeLists.txt index f964325c06..4d5196505c 100644 --- a/test/grouped_convnd_fwd_activation/CMakeLists.txt +++ b/test/grouped_convnd_fwd_activation/CMakeLists.txt @@ -1,4 +1,10 @@ if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_grouped_convnd_fwd_bias_bnorm_clamp test_grouped_convnd_fwd_bias_bnorm_clamp.cpp) + target_link_libraries(test_grouped_convnd_fwd_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) + + 
add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp) + target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) + add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp) target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance) diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp new file mode 100644 index 0000000000..bf96d11d53 --- /dev/null +++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; + +template +class TestGroupedConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using InLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using OutLayout = std::tuple_element_t<3, Tuple>; + using IndexType = ck::index_t; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndFwd2d : public TestGroupedConvndFwd +{ +}; + +template +class TestGroupedConvndFwd3d : public TestGroupedConvndFwd +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwd2d, Test2D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + 
this->conv_params.push_back( + {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp new file mode 100644 index 0000000000..2400008ffa --- /dev/null +++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; + +template +class TestGroupedConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using InLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using OutLayout = std::tuple_element_t<3, Tuple>; + using IndexType = ck::index_t; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && + ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndFwd2d : public TestGroupedConvndFwd +{ +}; + +template +class TestGroupedConvndFwd3d : public TestGroupedConvndFwd +{ +}; + 
+TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwd2d, Test2D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} From 54c7e08a2f7624409c9b2f7804e2a095079c89e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 7 Aug 2025 10:00:09 +0200 Subject: [PATCH 15/21] Fix clang format after conv changes (#2636) --- .../profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp index 43bab919b4..cd6c141219 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp @@ -279,8 +279,8 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, in_element_op, wei_element_op, Add{}, - {}, - {}, + {}, + {}, d_tensors); // init host output to zero @@ -416,9 +416,9 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, run_impl(op_ptr, argument_ptr); } - std::cout << "Best configuration parameters:" - << "\nname: " << best_op_name << "\navg_time: " << best_avg_time - << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << 
std::endl; + std::cout << "Best configuration parameters:" << "\nname: " << best_op_name + << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops + << "\nGB/s: " << best_gb_per_sec << std::endl; return pass; } From 21e9983913657f2270e31a9d301c4b9a55c502ac Mon Sep 17 00:00:00 2001 From: Enrico Degregori <73224202+EnricoDeg@users.noreply.github.com> Date: Thu, 7 Aug 2025 12:30:08 +0200 Subject: [PATCH 16/21] Revert "Add padding to 1x1Stride1Pad0 conv specialization (grouped conv bwd weight) (#2610)" (#2637) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2203b0ddfe06f4f9f5126e54e78697dfb16118d4. Co-authored-by: Bartłomiej Kocot --- include/ck/ck.hpp | 3 + ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 11 +- .../gridwise_gemm_xdl_cshuffle_conv_v3.hpp | 198 ------------------ .../transform_conv_bwd_weight_to_gemm.hpp | 126 +++++++---- .../transform_conv_bwd_weight_to_gemm_v2.hpp | 120 +++++++---- 5 files changed, 168 insertions(+), 290 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 09801203ba..794c6f4e20 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -222,6 +222,9 @@ // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 +// workaround: conv crash when K, C is even +#define CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN 1 + // workaround: compiler crash when compiling recursive lambda #define CK_WORKAROUND_SWDEV_275126 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index ed64b83356..1cd1f16245 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -331,8 +331,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 using CGridDesc_M_N = remove_cvref_t; using GridwiseGemm = GridwiseGemm_xdl_cshuffle_conv_v3< - tensor_layout::gemm::ColumnMajor, tensor_layout::gemm::RowMajor, + tensor_layout::gemm::ColumnMajor, tensor_layout::gemm::RowMajor, ADataType, BDataType, @@ -1299,6 +1299,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 if constexpr(ConvBackwardWeightSpecialization == ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) { +// workaround: disable when K, C is even +#if CK_WORKAROUND_DISABLE_FILTER1x1STRIDE1PAD0_WHEN_K_C_IS_EVEN + if(arg.Conv_C_ % 2 == 0 || arg.Conv_K_ % 2 == 0) + { + return false; + } +#endif // check if it's 1x1, stride=1 pad = 0 conv for(int i = 0; i < NDimSpatial; i++) { @@ -1323,7 +1330,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 } // Gridwise GEMM size - return GridwiseGemm::CheckValidity(gemm_arg); + return true; } bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp index 382d2870e8..68112489ca 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp @@ -4,7 +4,6 @@ #pragma once #include "ck/utility/common_header.hpp" -#include "ck/utility/env.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -607,203 +606,6 @@ struct GridwiseGemm_xdl_cshuffle_conv_v3 c_block_size * sizeof(CShuffleDataType)); } - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} - __host__ static constexpr bool CheckValidity(const Argument& 
karg) - { - static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && - (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, - "Invalid tuning param!"); - - if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && - !(is_same::value)) - { - if(!(karg.M % MPerBlock == 0)) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " " - << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ - << std::endl; - } - return false; - } - } - - if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && - (is_same::value)) - { - if(!(karg.N % NPerBlock == 0)) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " " - << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ - << std::endl; - } - return false; - } - } - - if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::KPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) - { - - auto K_t = karg.KBatch * KPerBlock; - if(!(karg.K % K_t == 0)) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! 
K: " - << karg.K << " " << __FILE__ << ":" << __LINE__ - << ", in function: " << __func__ << std::endl; - } - return false; - } - } - else - { - constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); - auto K_t = karg.KBatch * KReadVec; - auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; - if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) - { - return false; - } - } - - if constexpr(is_same::value) - { - if(karg.K % ABlockTransferSrcScalarPerVector != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg K (" << karg.K - << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" - << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" - << __LINE__ << ", in function: " << __func__ << std::endl; - } - return false; - } - } - else - { - if(karg.M % ABlockTransferSrcScalarPerVector != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg M (" << karg.M - << ") value is not a multiple of ABlockTransferSrcScalarPerVector (" - << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" - << __LINE__ << ", in function: " << __func__ << std::endl; - } - return false; - } - } - - if constexpr(is_same::value) - { - if(karg.N % BBlockTransferSrcScalarPerVector != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg N (" << karg.N - << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" - << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":" - << __LINE__ << ", in function: " << __func__ << std::endl; - } - return false; - } - } - else - { - if(karg.K % BBlockTransferSrcScalarPerVector != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg K (" << karg.K - << ") value is not a multiple of BBlockTransferSrcScalarPerVector (" - << BBlockTransferSrcScalarPerVector << " )! 
" << __FILE__ << ":" - << __LINE__ << ", in function: " << __func__ << std::endl; - } - return false; - } - } - - if constexpr(is_same::value) - { - if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg N (" << karg.N - << ") value is not a multiple of " - "CShuffleBlockTransferScalarPerVector_NPerBlock (" - << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " - << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ - << std::endl; - } - return false; - } - } - else - { - if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "Arg M (" << karg.M - << ") value is not a multiple of " - "CShuffleBlockTransferScalarPerVector_NPerBlock (" - << CShuffleBlockTransferScalarPerVector_NPerBlock << " )! " - << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ - << std::endl; - } - return false; - } - } - - if constexpr(!(is_same, half_t>::value || - is_same, float>::value || - is_same, bhalf_t>::value || - is_same, int32_t>::value)) - { - if(!karg.IsReduceAdd()) - { - if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << " KBatch: " << karg.KBatch << " > 1 is not support yet" << __FILE__ - << ":" << __LINE__ << ", in function: " << __func__ << std::endl; - } - if(karg.KBatch > 1) - { - return false; - } - } - } - - // check gridwise gemm pipeline - const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value); - - if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1) - { - if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages) - { - return false; - } - } - - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) - return true; - } - __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { const index_t num_loop = K / KPerBlock; diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp 
b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp index efc7f20cdc..bd3ab10802 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp @@ -192,7 +192,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -210,7 +210,7 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -218,17 +218,9 @@ struct TransformConvBwdWeightToGemm const auto wei_gemmm_gemmn_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_gemmm_gemmn_grid_desc); } else { @@ -248,7 +240,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -287,7 +279,7 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -296,6 +288,26 @@ struct TransformConvBwdWeightToGemm make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_gemmm_gemmn_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -303,8 +315,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, 
Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } @@ -380,7 +392,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -395,21 +407,13 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_grid_desc); } else { @@ -424,7 +428,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), 
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -465,11 +469,31 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -477,8 +501,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, 
wei_gemmm_gemmn_pad_grid_desc); } } @@ -561,7 +585,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -576,21 +600,13 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_grid_desc); } else { @@ -605,7 +621,7 @@ struct TransformConvBwdWeightToGemm const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); @@ -655,11 +671,31 @@ struct TransformConvBwdWeightToGemm const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch), + make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -667,8 +703,8 @@ struct TransformConvBwdWeightToGemm make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } // function end diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp index 
e410f06190..b72ddb8243 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp @@ -374,7 +374,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -390,21 +390,13 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_grid_desc); } else { @@ -420,7 +412,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -461,11 +453,29 @@ struct 
TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -473,8 +483,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } @@ -552,7 +562,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -568,21 +578,13 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_grid_desc); } else { @@ -598,7 +600,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -648,11 +650,29 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, 
Sequence<1>{})); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -660,8 +680,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } @@ -745,7 +765,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -761,21 +781,13 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( 
in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - // Padd - const auto wei_gemmm_gemmn_pad_grid_desc = - transform_tensor_descriptor(wei_grid_desc, - make_tuple(make_right_pad_transform(GemmM, PadGemmM), - make_right_pad_transform(GemmN, PadGemmN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, - wei_gemmm_gemmn_pad_grid_desc); + wei_grid_desc); } else { @@ -791,7 +803,7 @@ struct TransformConvBwdWeightToGemmV2 const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( out_gemmkpad_gemmm_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmM, PadGemmM)), + make_pass_through_transform(GemmM)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); @@ -856,11 +868,29 @@ struct TransformConvBwdWeightToGemmV2 const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( in_gemmkpad_gemmn_grid_desc, make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)), - make_right_pad_transform(GemmN, PadGemmN)), + make_pass_through_transform(GemmN)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // Padd + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmM, PadGemmM), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, 
Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc = + transform_tensor_descriptor( + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmKBatch * GemmK0), + make_right_pad_transform(GemmN, PadGemmN), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + const auto wei_gemmm_gemmn_pad_grid_desc = transform_tensor_descriptor(wei_grid_desc, make_tuple(make_right_pad_transform(GemmM, PadGemmM), @@ -868,8 +898,8 @@ struct TransformConvBwdWeightToGemmV2 make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, - in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc, wei_gemmm_gemmn_pad_grid_desc); } } // function end From ffdee5e774cf73c3dc35869259ae8f460f969f1b Mon Sep 17 00:00:00 2001 From: Sami Remes Date: Thu, 7 Aug 2025 15:45:27 +0300 Subject: [PATCH 17/21] [CK_TILE] Enable printing more structures in CK-Tile (#2443) * Add more printing to core cktile * Revert other changes in static encoding pattern * Refactor to using a free print() function * Remove loops and print just the containers * Print tuple with better formatting, fix sequence compilation * Add some tests for print utility * Add print utility header * Print for static_encoding_pattern * add buffer_view printing * Align vector_traits * Fix formatting * Lower-case enum strings Co-authored-by: Christopher Millette <63608002+cgmillette@users.noreply.github.com> * Remove empty comment lines * Fix test with lower-case too * Reduce repeated code in print tests, move helper function closer to type definition, test X&Y * Add test_print_common.hpp * add print.hpp in core.hpp --------- Co-authored-by: Aviral Goel 
Co-authored-by: Christopher Millette <63608002+cgmillette@users.noreply.github.com> Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> --- include/ck_tile/core.hpp | 1 + .../core/algorithm/coordinate_transform.hpp | 419 ++++++++---------- .../algorithm/static_encoding_pattern.hpp | 48 ++ include/ck_tile/core/arch/arch.hpp | 15 + include/ck_tile/core/container/array.hpp | 20 +- include/ck_tile/core/container/map.hpp | 35 +- include/ck_tile/core/container/sequence.hpp | 28 +- include/ck_tile/core/container/tuple.hpp | 21 +- .../core/numeric/integral_constant.hpp | 8 +- include/ck_tile/core/numeric/vector_type.hpp | 4 +- include/ck_tile/core/tensor/buffer_view.hpp | 109 +---- .../ck_tile/core/tensor/tensor_adaptor.hpp | 65 +-- .../ck_tile/core/tensor/tensor_descriptor.hpp | 42 +- .../ck_tile/core/tensor/tile_distribution.hpp | 41 +- .../tensor/tile_distribution_encoding.hpp | 204 ++++----- include/ck_tile/core/utility/print.hpp | 76 ++++ test/ck_tile/CMakeLists.txt | 3 +- test/ck_tile/utility/CMakeLists.txt | 4 + test/ck_tile/utility/print/CMakeLists.txt | 8 + test/ck_tile/utility/print/README.md | 70 +++ .../utility/print/test_print_array.cpp | 59 +++ .../utility/print/test_print_basic_types.cpp | 76 ++++ .../utility/print/test_print_buffer_view.cpp | 78 ++++ .../utility/print/test_print_common.hpp | 25 ++ .../print/test_print_coordinate_transform.cpp | 83 ++++ .../utility/print/test_print_sequence.cpp | 45 ++ .../test_print_static_encoding_pattern.cpp | 89 ++++ .../utility/print/test_print_tuple.cpp | 66 +++ 28 files changed, 1211 insertions(+), 531 deletions(-) create mode 100644 include/ck_tile/core/utility/print.hpp create mode 100644 test/ck_tile/utility/CMakeLists.txt create mode 100644 test/ck_tile/utility/print/CMakeLists.txt create mode 100644 test/ck_tile/utility/print/README.md create mode 100644 test/ck_tile/utility/print/test_print_array.cpp create mode 100644 test/ck_tile/utility/print/test_print_basic_types.cpp create mode 100644 
test/ck_tile/utility/print/test_print_buffer_view.cpp create mode 100644 test/ck_tile/utility/print/test_print_common.hpp create mode 100644 test/ck_tile/utility/print/test_print_coordinate_transform.cpp create mode 100644 test/ck_tile/utility/print/test_print_sequence.cpp create mode 100644 test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp create mode 100644 test/ck_tile/utility/print/test_print_tuple.cpp diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 188cebaabc..c8945f03e9 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -74,6 +74,7 @@ #include "ck_tile/core/utility/literals.hpp" #include "ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/philox_rand.hpp" +#include "ck_tile/core/utility/print.hpp" #include "ck_tile/core/utility/random.hpp" #include "ck_tile/core/utility/reduce_operator.hpp" #include "ck_tile/core/utility/static_counter.hpp" diff --git a/include/ck_tile/core/algorithm/coordinate_transform.hpp b/include/ck_tile/core/algorithm/coordinate_transform.hpp index f7f9489f4c..7511413bba 100644 --- a/include/ck_tile/core/algorithm/coordinate_transform.hpp +++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp @@ -9,6 +9,7 @@ #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/magic_div.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { @@ -139,20 +140,19 @@ struct pass_through : public base_transform<1, 1> { return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("pass_through{"); - - // - printf("up_lengths_:"); - print(up_lengths_); - - // - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const pass_through& pt) +{ + printf("pass_through{"); + + printf("up_lengths_: "); + print(pt.get_upper_lengths()); + + printf("}"); +} + template ck_tile::is_known_at_compile_time::value && 
ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("left_pad_length_: "); - print(left_pad_length_); - printf(", "); - - // - printf("right_pad_length_: "); - print(right_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const pad& p) +{ + printf("pad{"); + printf("up_lengths_: "); + print(p.up_lengths_); + printf(", left_pad_length_: "); + print(p.left_pad_length_); + printf(", right_pad_length_: "); + print(p.right_pad_length_); + printf("}"); +} + template struct left_pad { @@ -330,24 +326,20 @@ struct left_pad // It's up to runtime to check the padding length should be multiple of vector length return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("left_pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("left_pad_length_: "); - print(left_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const left_pad& lp) +{ + printf("left_pad{"); + printf("up_lengths_: "); + print(lp.up_lengths_); + printf(", left_pad_length_: "); + print(lp.left_pad_length_); + printf("}"); +} + template struct right_pad : public base_transform<1, 1> { @@ -430,24 +422,20 @@ struct right_pad : public base_transform<1, 1> // It's up to runtime to check the padding length should be multiple of vector length return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("right_pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("right_pad_length_: "); - print(right_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const right_pad& rp) +{ + printf("right_pad{"); + printf("up_lengths_: "); + print(rp.up_lengths_); + printf(", 
right_pad_length_: "); + print(rp.right_pad_length_); + printf("}"); +} + // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] // UpLengths and Coefficients can be either of the followings: // 1) Tuple of index_t, which is known at run-time, or @@ -532,24 +520,19 @@ struct embed : public base_transform<1, UpLengths::size()> return ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("embed{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("coefficients_: "); - print(coefficients_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const embed& e) +{ + printf("embed{"); + printf("up_lengths_: "); + print(e.up_lengths_); + printf(", coefficients_: "); + print(e.coefficients_); + printf("}"); +} + template struct lambda_merge_generate_MagicDivision_calculate_magic_divisor { @@ -699,24 +682,19 @@ struct merge_v2_magic_division : public base_transform return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("merge_v2_magic_division{"); - - // - printf("low_lengths_ "); - print(low_lengths_); - printf(", "); - - // - printf("up_lengths_ "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const merge_v2_magic_division& m) +{ + printf("merge_v2_magic_division{"); + printf("low_lengths_: "); + print(m.low_lengths_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + // Implementation of "merge" transformation primitive that uses division and mod. 
It is supposed to // be used for low_lengths that are known at compile time and are power of 2, otherwise performance // will be very bad @@ -830,29 +808,21 @@ struct merge_v3_division_mod : public base_transform return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("Merge_v3_direct_division_mod{"); - - // - printf("low_lengths_ "); - print(low_lengths_); - printf(", "); - - // - printf("low_lengths_scan_ "); - print(low_lengths_scan_); - printf(", "); - - // - printf("up_lengths_ "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const merge_v3_division_mod& m) +{ + printf("merge_v3_division_mod{"); + printf("low_lengths_: "); + print(m.low_lengths_); + printf(", low_lengths_scan_: "); + print(m.low_lengths_scan_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + template struct unmerge : public base_transform<1, UpLengths::size()> { @@ -958,24 +928,19 @@ struct unmerge : public base_transform<1, UpLengths::size()> return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("unmerge{"); - - // - printf("up_lengths_"); - print(up_lengths_); - printf(", "); - - // - printf("up_lengths_scan_"); - print(up_lengths_scan_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const unmerge& u) +{ + printf("unmerge{"); + printf("up_lengths_: "); + print(u.up_lengths_); + printf(", up_lengths_scan_: "); + print(u.up_lengths_scan_); + printf("}"); +} + template struct freeze : public base_transform<1, 0> { @@ -1023,19 +988,17 @@ struct freeze : public base_transform<1, 0> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("freeze{"); - - // - printf("low_idx_: "); - print(low_idx_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const freeze& f) +{ + printf("freeze{"); + 
printf("low_idx_: "); + print(f.low_idx_); + printf("}"); +} + // insert a dangling upper dimension without lower dimension template struct insert : public base_transform<0, 1> @@ -1092,18 +1055,17 @@ struct insert : public base_transform<0, 1> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("insert{"); - - // - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const insert& i) +{ + printf("insert{"); + printf("up_lengths_: "); + print(i.up_lengths_); + printf("}"); +} + // replicate the original tensor and create a higher dimensional tensor template struct replicate : public base_transform<0, UpLengths::size()> @@ -1152,21 +1114,19 @@ struct replicate : public base_transform<0, UpLengths::size()> return ck_tile::is_known_at_compile_time::value; } - CK_TILE_HOST_DEVICE void print() const - { - printf("replicate{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - - printf("}"); - } - // UpLengths up_lengths_; }; +template +CK_TILE_HOST_DEVICE static void print(const replicate& r) +{ + printf("replicate{"); + printf("up_lengths_: "); + print(r.up_lengths_); + printf("}"); +} + template struct slice : public base_transform<1, 1> { @@ -1238,28 +1198,20 @@ struct slice : public base_transform<1, 1> ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } +}; - CK_TILE_HOST_DEVICE void print() const - { - printf("slice{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("slice_begin_: "); - print(slice_begin_); - printf(", "); - - // - printf("slice_end_: "); - print(slice_end_); - - printf("}"); - } // namespace ck -}; // namespace ck +template +CK_TILE_HOST_DEVICE static void print(const slice& s) +{ + printf("slice{"); + printf("up_lengths_: "); + print(s.up_lengths_); + printf(", slice_begin_: "); + print(s.slice_begin_); + printf(", slice_end_: "); + print(s.slice_end_); + 
printf("}"); +} /* * \brief lower_idx = upper_idx % modulus. @@ -1328,19 +1280,19 @@ struct modulo : public base_transform<1, 1> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("Modulus{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const modulo& m) +{ + printf("modulo{"); + printf("modulus_: "); + print(m.modulus_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + // 2D XOR, NOTE: "xor" is a keyword template struct xor_t : public base_transform<2, 2> @@ -1424,20 +1376,17 @@ struct xor_t : public base_transform<2, 2> return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("xor_t{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const xor_t& x) +{ + printf("xor_t{"); + printf("up_lengths_: "); + print(x.up_lengths_); + printf("}"); +} + template struct offset : public base_transform<1, 1> { @@ -1509,24 +1458,19 @@ struct offset : public base_transform<1, 1> return ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("offset{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("offset_length_: "); - print(offset_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const offset& o) +{ + printf("offset{"); + printf("up_lengths_: "); + print(o.up_lengths_); + printf(", offset_length_: "); + print(o.offset_length_); + printf("}"); +} + template struct indexing : public base_transform<1, 1> { @@ -1595,20 +1539,19 @@ struct indexing : public base_transform<1, 1> return ck_tile::is_known_at_compile_time::value && IndexingAdaptor::is_known_at_compile_time(); } - - CK_TILE_HOST_DEVICE void 
print() const - { - printf("embed{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const indexing& i) +{ + printf("indexing{"); + printf("up_lengths_: "); + print(i.up_lengths_); + printf(", iadaptor_: "); + print(i.iadaptor_); + printf("}"); +} + //******************************************************************************************************* template diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp index 8a3de3e5e0..1f6c389090 100644 --- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp +++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp @@ -77,6 +77,7 @@ #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/tensor/tile_distribution.hpp" #include "ck_tile/core/tensor/tile_distribution_encoding.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { @@ -317,4 +318,51 @@ struct TileDistributionEncodingPattern2D +CK_TILE_HOST_DEVICE void print(const TileDistributionEncodingPattern2D&) +{ + using PatternType = TileDistributionEncodingPattern2D; + + printf("TileDistributionEncodingPattern2D: ", + BlockSize, + YPerTile, + XPerTile, + VecSize, + tile_distribution_pattern_to_string(DistributionPattern)); + printf("{: <%d, %d, %d>, : <%d, %d>}\n", + PatternType::Y0, + PatternType::Y1, + PatternType::Y2, + PatternType::X0, + PatternType::X1); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 96df9d70f7..ab42ec8617 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -218,4 +218,19 @@ CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity() #endif } +/// Helper function to convert address space enum to string +CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_enum addr_space) +{ + switch(addr_space) + 
{ + case address_space_enum::generic: return "generic"; + case address_space_enum::global: return "global"; + case address_space_enum::lds: return "lds"; + case address_space_enum::sgpr: return "sgpr"; + case address_space_enum::constant: return "constant"; + case address_space_enum::vgpr: return "vgpr"; + default: return "unknown"; + } +} + } // namespace ck_tile diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp index 94aa40e278..352c645325 100644 --- a/include/ck_tile/core/container/array.hpp +++ b/include/ck_tile/core/container/array.hpp @@ -177,9 +177,27 @@ struct array CK_TILE_HOST_DEVICE constexpr array() {} CK_TILE_HOST_DEVICE static constexpr index_t size() { return 0; } CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v; }; - CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); } }; +template +CK_TILE_HOST_DEVICE static void print(const array& a) +{ + printf("array{size: %ld, data: [", static_cast(N)); + for(index_t i = 0; i < N; ++i) + { + if(i > 0) + printf(", "); + print(a[i]); + } + printf("]}"); +} + +template +CK_TILE_HOST_DEVICE static void print(const array&) +{ + printf("array{size: 0, data: []}"); +} + template struct vector_traits; diff --git a/include/ck_tile/core/container/map.hpp b/include/ck_tile/core/container/map.hpp index 87b180cafc..7697995c92 100644 --- a/include/ck_tile/core/container/map.hpp +++ b/include/ck_tile/core/container/map.hpp @@ -139,26 +139,21 @@ struct map // WARNING: needed by compiler for C++ range-based for loop only, don't use this function! 
CK_TILE_HOST_DEVICE constexpr iterator end() { return iterator{impl_, size_}; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("map{size_: %d, ", size_); - // - printf("impl_: ["); - // - for(const auto& [k, d] : *this) - { - printf("{key: "); - print(k); - printf(", data: "); - print(d); - printf("}, "); - } - // - printf("]"); - // - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const map& m) +{ + printf("map{size_: %d, impl_: [", m.size_); + for(const auto& [k, d] : m) + { + printf("{key: "); + print(k); + printf(", data: "); + print(d); + printf("}, "); + } + printf("]}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index 94309dd5dd..905b32dd15 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -9,13 +9,10 @@ #include "ck_tile/core/numeric/math.hpp" #include "ck_tile/core/utility/to_sequence.hpp" #include "ck_tile/core/utility/type_traits.hpp" -#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { -template -struct static_for; - template struct sequence; @@ -196,15 +193,24 @@ struct sequence { return sequence{}; } - - CK_TILE_HOST_DEVICE static void print() - { - printf("sequence{size: %d, data: [", size()); - ((printf("%d ", Is)), ...); - printf("]}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const sequence&) +{ + printf("sequence<"); + if constexpr(sizeof...(Is) > 0) + { + bool first = true; + (([&first](index_t value) { + printf("%s%d", first ? 
"" : ", ", value); + first = false; + }(Is)), + ...); + } + printf(">"); +} + namespace impl { template struct __integer_sequence; diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index 63d145d8b9..4c48b3d477 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -300,12 +300,29 @@ struct tuple : impl::tuple_base, T...> #undef TP_COM_ }; -template +template +CK_TILE_HOST_DEVICE void print(const tuple& t) +{ + printf("tuple<"); + if constexpr(sizeof...(T) > 0) + { + bool first = true; + static_for<0, sizeof...(T), 1>{}([&t, &first](auto i) { + if(!first) + printf(", "); + print(t.get(i)); + first = false; + }); + } + printf(">"); +} + +template struct vector_traits; // specialization for array template -struct vector_traits> +struct vector_traits, void> { using scalar_type = __type_pack_element<0, T...>; static constexpr index_t vector_size = sizeof...(T); diff --git a/include/ck_tile/core/numeric/integral_constant.hpp b/include/ck_tile/core/numeric/integral_constant.hpp index 33c24da8c5..2ba2fd10c6 100644 --- a/include/ck_tile/core/numeric/integral_constant.hpp +++ b/include/ck_tile/core/numeric/integral_constant.hpp @@ -19,14 +19,18 @@ struct constant CK_TILE_HOST_DEVICE static constexpr bool is_static() { return true; } }; +template +CK_TILE_HOST_DEVICE static void print(const constant&) +{ + printf("%ld", static_cast(v)); +} + template struct integral_constant : constant { using value_type = T; using type = integral_constant; // using injected-class-name static constexpr T value = v; - // constexpr CK_TILE_HOST_DEVICE operator value_type() const noexcept { return value; } - // constexpr CK_TILE_HOST_DEVICE value_type operator()() const noexcept { return value; } // }; template diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp index b165275a8c..58bdb43b08 100644 --- a/include/ck_tile/core/numeric/vector_type.hpp 
+++ b/include/ck_tile/core/numeric/vector_type.hpp @@ -84,7 +84,7 @@ using ext_vector_t = typename impl::ext_vector::type; // by default, any type will result in a vector_size=1 with scalar_type=T traits. // ... unless we have other vector_traits specialization -template +template struct vector_traits { using scalar_type = @@ -94,7 +94,7 @@ struct vector_traits // specialization for ext_vector_type() template -struct vector_traits +struct vector_traits { using scalar_type = std::conditional_t, int8_t, T>; static constexpr index_t vector_size = N; diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 4b39773939..ca314a6abe 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -210,28 +210,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: Global @@ -757,28 +735,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: LDS @@ -1138,28 +1094,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: Vgpr @@ -1313,28 +1247,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; template 
+CK_TILE_HOST_DEVICE void print(const buffer_view& bv) +{ + printf("buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ", + address_space_to_string(BufferAddressSpace), + static_cast(const_cast*>(bv.p_data_))); + print(bv.buffer_size_); + printf(", invalid_element_value_: "); + print(bv.invalid_element_value_); + printf("}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tensor_adaptor.hpp b/include/ck_tile/core/tensor/tensor_adaptor.hpp index e2a6ae6555..ec5538d79c 100644 --- a/include/ck_tile/core/tensor/tensor_adaptor.hpp +++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp @@ -305,42 +305,45 @@ struct tensor_adaptor get_container_subset(vector_strides, top_dims)); } - CK_TILE_HOST_DEVICE void print() const - { - printf("tensor_adaptor{"); - - // - printf("transforms: "); - print(transforms_); - printf(", "); - - // - printf("LowerDimensionHiddenIds: "); - print(LowerDimensionHiddenIdss{}); - printf(", "); - - // - printf("UpperDimensionHiddenIds: "); - print(UpperDimensionHiddenIdss{}); - printf(", "); - - // - printf("BottomDimensionHiddenIds: "); - print(BottomDimensionHiddenIds{}); - printf(", "); - - // - printf("TopDimensionHiddenIds: "); - print(TopDimensionHiddenIds{}); - - printf("}"); - } - private: Transforms transforms_; ElementSize element_size_; }; +template +CK_TILE_HOST_DEVICE static void print(const tensor_adaptor& adaptor) +{ + printf("tensor_adaptor{\n"); + printf(" transforms: ["); + print(adaptor.get_transforms()); + printf("],\n"); + + printf(" LowerDimensionHiddenIds: ["); + print(LowerDimensionHiddenIdss{}); + printf("],\n"); + + printf(" UpperDimensionHiddenIds: ["); + print(UpperDimensionHiddenIdss{}); + printf("],\n"); + + printf(" BottomDimensionHiddenIds: ["); + print(BottomDimensionHiddenIds{}); + printf("],\n"); + + // + printf(" TopDimensionHiddenIds: ["); + print(TopDimensionHiddenIds{}); + printf("]\n}\n"); +} + // Transforms: Tuple // LowerDimensionOldTopIdss: Tuple, ...> // 
UpperDimensionNewTopIdss: Tuple, ...> diff --git a/include/ck_tile/core/tensor/tensor_descriptor.hpp b/include/ck_tile/core/tensor/tensor_descriptor.hpp index 0c3e04f315..0e4787a2f1 100644 --- a/include/ck_tile/core/tensor/tensor_descriptor.hpp +++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp @@ -140,25 +140,37 @@ struct tensor_descriptor : public tensor_adaptor(GuaranteedVectorStrides{})); } - CK_TILE_HOST_DEVICE void print() const - { - printf("tensor_descriptor{"); - - // tensor_adaptor - Base::print(); - printf(", "); - - // element_space_size_ - printf("element_space_size_: "); - print(element_space_size_); - - printf("}"); - } - // TODO make these private ElementSpaceSize element_space_size_; }; +template +CK_TILE_HOST_DEVICE static void print(const tensor_descriptor& descriptor) +{ + printf("tensor_descriptor{\n"); + // first print the tensor adaptor part of the descriptor using the base class print + print(static_cast(descriptor)); + printf("element_space_size_: %ld,\n", + static_cast(descriptor.get_element_space_size().value)); + printf("guaranteed_vector_lengths: "); + print(GuaranteedVectorLengths{}); + printf(",\nguaranteed_vector_strides: "); + print(GuaranteedVectorStrides{}); + printf("}\n}\n"); +} + template CK_TILE_HOST_DEVICE constexpr auto make_tensor_descriptor_from_adaptor(const Adaptor& adaptor, diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp index 11e6b35c39..bc02ec74d2 100644 --- a/include/ck_tile/core/tensor/tile_distribution.hpp +++ b/include/ck_tile/core/tensor/tile_distribution.hpp @@ -228,24 +228,6 @@ struct tile_distribution { return PsYs2XsAdaptor::is_static() && Ys2DDescriptor::is_static(); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution{"); - // - printf("tile_distribution_encoding: "); - print(DstrEncode{}); - printf(", "); - // - printf("ps_ys_to_xs_: "); - print(ps_ys_to_xs_); - printf(", "); - // - printf("ys_to_d_: "); - 
print(ys_to_d_); - // - printf("}"); - } }; namespace detail { @@ -710,4 +692,27 @@ CK_TILE_HOST_DEVICE constexpr auto slice_distribution_from_x( } } // namespace detail + +// Free print function for tile_distribution +template +CK_TILE_HOST_DEVICE void print(const tile_distribution& distribution) +{ + printf("tile_distribution{"); + printf("tile_distribution_encoding: "); + print(StaticTileDistributionEncoding_{}); + printf(", "); + printf("ps_ys_to_xs_: "); + print(distribution.ps_ys_to_xs_); + printf(", "); + printf("ys_to_d_: "); + print(distribution.ys_to_d_); + printf("}\n"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp index b380e7c9d8..90d1a2ccb2 100644 --- a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp +++ b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp @@ -428,109 +428,7 @@ struct tile_distribution_encoding { return get_sorted_info(get_uniformed_idx_y_to_h(), get_h_dim_lengths_prefix_sum()); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution_encoding::detail{"); - // - printf("ndim_rh_major_: "); - print(ndim_rh_major_); - printf(", "); - // - printf("ndim_span_major_: "); - print(ndim_span_major_); - printf(", "); - // - printf("ndims_rhs_minor_: "); - print(ndims_rhs_minor_); - printf(", "); - // - printf("ndim_rh_major_: "); - print(ndim_rh_major_); - printf(", "); - // - printf("max_ndim_rh_minor_: "); - print(max_ndim_rh_minor_); - printf(", "); - // - printf("rhs_lengthss_: "); - print(rhs_lengthss_); - printf(", "); - // - printf("ys_lengths_: "); - print(ys_lengths_); - printf(", "); - // - printf("rhs_major_minor_to_ys_: "); - print(rhs_major_minor_to_ys_); - printf(", "); - // - printf("ndims_span_minor_: "); - print(ndims_span_minor_); - printf(", "); - // - printf("max_ndim_span_minor_: "); - print(max_ndim_span_minor_); - printf(", "); - // - printf("ys_to_span_major_: "); - 
print(ys_to_span_major_); - printf(", "); - // - printf("ys_to_span_minor_: "); - print(ys_to_span_minor_); - printf(", "); - // - printf("distributed_spans_lengthss_: "); - print(distributed_spans_lengthss_); - printf(", "); - // - printf("ndims_distributed_spans_minor_: "); - print(ndims_distributed_spans_minor_); - printf(", "); - // - printf("ps_over_rs_derivative_: "); - print(ps_over_rs_derivative_); - // - printf("}"); - } }; - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution_encoding{"); - // - printf("NDimX: %d, NDimP: %d, NDimY: %d, ", NDimX, NDimP, NDimY); - // - printf("rs_lengths_: "); - print(rs_lengths_); - printf(", "); - // - printf("hs_lengthss_: "); - print(hs_lengthss_); - printf(", "); - // - printf("ps_to_rhss_major_: "); - print(ps_to_rhss_major_); - printf(", "); - // - printf("ps_to_rhss_minor_: "); - print(ps_to_rhss_minor_); - printf(", "); - // - printf("ys_to_rhs_major_: "); - print(ys_to_rhs_major_); - printf(", "); - // - printf("ys_to_rhs_minor_: "); - print(ys_to_rhs_minor_); - printf(", "); - // - printf("detail: "); - print(detail{}); - // - printf("}"); - } }; template @@ -896,4 +794,106 @@ make_reduce_tile_distribution_encoding(InDstr, sequence reduce } } // namespace detail + +// Free print function for tile_distribution_encoding::detail +template +CK_TILE_HOST_DEVICE void +print(const typename tile_distribution_encoding::detail& detail_obj) +{ + printf("tile_distribution_encoding::detail{"); + printf("ndim_rh_major_: "); + print(detail_obj.ndim_rh_major_); + printf(", "); + printf("ndim_span_major_: "); + print(detail_obj.ndim_span_major_); + printf(", "); + printf("ndims_rhs_minor_: "); + print(detail_obj.ndims_rhs_minor_); + printf(", "); + printf("ndim_rh_major_: "); + print(detail_obj.ndim_rh_major_); + printf(", "); + printf("max_ndim_rh_minor_: "); + print(detail_obj.max_ndim_rh_minor_); + printf(", "); + printf("rhs_lengthss_: "); + print(detail_obj.rhs_lengthss_); + printf(", "); + 
printf("ys_lengths_: "); + print(detail_obj.ys_lengths_); + printf(", "); + printf("rhs_major_minor_to_ys_: "); + print(detail_obj.rhs_major_minor_to_ys_); + printf(", "); + printf("ndims_span_minor_: "); + print(detail_obj.ndims_span_minor_); + printf(", "); + printf("max_ndim_span_minor_: "); + print(detail_obj.max_ndim_span_minor_); + printf(", "); + printf("ys_to_span_major_: "); + print(detail_obj.ys_to_span_major_); + printf(", "); + printf("ys_to_span_minor_: "); + print(detail_obj.ys_to_span_minor_); + printf(", "); + printf("distributed_spans_lengthss_: "); + print(detail_obj.distributed_spans_lengthss_); + printf(", "); + printf("ndims_distributed_spans_minor_: "); + print(detail_obj.ndims_distributed_spans_minor_); + printf(", "); + printf("ps_over_rs_derivative_: "); + print(detail_obj.ps_over_rs_derivative_); + printf("}"); +} + +// Free print function for tile_distribution_encoding +template +CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding& encoding) +{ + printf("tile_distribution_encoding{"); + + printf("NDimX: %d, NDimP: %d, NDimY: %d, ", encoding.NDimX, encoding.NDimP, encoding.NDimY); + printf("rs_lengths_: "); + print(encoding.rs_lengths_); + printf(", "); + printf("hs_lengthss_: "); + print(encoding.hs_lengthss_); + printf(", "); + printf("ps_to_rhss_major_: "); + print(encoding.ps_to_rhss_major_); + printf(", "); + printf("ps_to_rhss_minor_: "); + print(encoding.ps_to_rhss_minor_); + printf(", "); + printf("ys_to_rhs_major_: "); + print(encoding.ys_to_rhs_major_); + printf(", "); + printf("ys_to_rhs_minor_: "); + print(encoding.ys_to_rhs_minor_); + printf(", "); + printf("}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/utility/print.hpp b/include/ck_tile/core/utility/print.hpp new file mode 100644 index 0000000000..04635959af --- /dev/null +++ b/include/ck_tile/core/utility/print.hpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core/config.hpp" + +namespace ck_tile { + +/// Declare a ck_tile::print() interface that gets specialized in each header file for types that +/// can be printed. +template +CK_TILE_HOST_DEVICE void print(const T&) +{ + static_assert(sizeof(T) == 0, + "No print implementation available for this type. Please specialize " + "ck_tile::print for your type."); +} + +/// Specialization for int +template <> +CK_TILE_HOST_DEVICE void print(const int& value) +{ + printf("%d", value); +} + +/// Specialization for float +template <> +CK_TILE_HOST_DEVICE void print(const float& value) +{ + printf("%f", value); +} + +/// Specialization for double +template <> +CK_TILE_HOST_DEVICE void print(const double& value) +{ + printf("%f", value); +} + +/// Specialization for long +template <> +CK_TILE_HOST_DEVICE void print(const long& value) +{ + printf("%ld", value); +} + +/// Specialization for unsigned int +template <> +CK_TILE_HOST_DEVICE void print(const unsigned int& value) +{ + printf("%u", value); +} + +/// Specialization for char +template <> +CK_TILE_HOST_DEVICE void print(const char& value) +{ + printf("%c", value); +} + +/// Specialization for array +template +CK_TILE_HOST_DEVICE void print(const T (&value)[N]) +{ + printf("["); + for(size_t i = 0; i < N; ++i) + { + if(i > 0) + printf(", "); + print(value[i]); // Recursively call print for each element + } + printf("]"); +} + +} // namespace ck_tile diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 9a1df56208..374e5b4990 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -21,4 +21,5 @@ add_subdirectory(add_rmsnorm2d_rdquant) # add_subdirectory(layernorm2d) # add_subdirectory(rmsnorm2d) add_subdirectory(gemm_block_scale) -add_subdirectory(reduce) \ No newline at end of file +add_subdirectory(utility) +add_subdirectory(reduce) diff --git a/test/ck_tile/utility/CMakeLists.txt b/test/ck_tile/utility/CMakeLists.txt new file mode 100644 index 
0000000000..c57cafca5a --- /dev/null +++ b/test/ck_tile/utility/CMakeLists.txt @@ -0,0 +1,4 @@ +message("-- Adding: test/ck_tile/utility/") + +# Add print tests +add_subdirectory(print) diff --git a/test/ck_tile/utility/print/CMakeLists.txt b/test/ck_tile/utility/print/CMakeLists.txt new file mode 100644 index 0000000000..5300dd20ca --- /dev/null +++ b/test/ck_tile/utility/print/CMakeLists.txt @@ -0,0 +1,8 @@ +# Print utility tests +add_gtest_executable(test_print_sequence test_print_sequence.cpp) +add_gtest_executable(test_print_array test_print_array.cpp) +add_gtest_executable(test_print_tuple test_print_tuple.cpp) +add_gtest_executable(test_print_coordinate_transform test_print_coordinate_transform.cpp) +add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp) +add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp) +add_gtest_executable(test_print_basic_types test_print_basic_types.cpp) diff --git a/test/ck_tile/utility/print/README.md b/test/ck_tile/utility/print/README.md new file mode 100644 index 0000000000..558c6faee4 --- /dev/null +++ b/test/ck_tile/utility/print/README.md @@ -0,0 +1,70 @@ +# Print Function Tests + +This directory contains unit tests for testing the print functionality of various data structures and coordinate transformations in the composable_kernel library. 
+ +## Tests Included + +### test_print_sequence.cpp +Tests the print functionality for `sequence<...>` containers: +- Simple sequences with multiple elements +- Single element sequences +- Empty sequences +- Longer sequences + +### test_print_array.cpp +Tests the print functionality for `array<T, N>` containers: +- Arrays with integer values +- Single element arrays +- Empty arrays (size 0) +- Arrays with floating point values + +### test_print_tuple.cpp +Tests the print functionality for `tuple<...>` containers: +- Simple tuples with numbers +- Single element tuples +- Empty tuples +- Mixed type tuples + +### test_print_coordinate_transform.cpp +Tests the print functionality for coordinate transformation structures: +- `pass_through` transform +- `embed` transform +- `merge` transform +- `unmerge` transform +- `freeze` transform + +## Testing Approach + +All tests use Google Test's `CaptureStdout()` functionality to capture the output from print functions and verify the formatting: + +```cpp +testing::internal::CaptureStdout(); +print(object); +std::string output = testing::internal::GetCapturedStdout(); +EXPECT_EQ(output, "expected_format"); +``` + +This approach enables testing of print function output without affecting the console during test execution. + +## Building and Running + +The tests are integrated into the CMake build system. To build and run the print tests: + +```bash +# Build the specific test +make test_print_sequence + +# Run the test +./test_print_sequence + +# Or run all print tests using CTest +ctest -R "test_print" +``` + +## Adding New Tests + +To add tests for new data structures: + +1. Create a new test file: `test_print_<type>.cpp` +2. Follow the existing pattern using `CaptureStdout()` +3.
Add the test executable to `CMakeLists.txt` diff --git a/test/ck_tile/utility/print/test_print_array.cpp b/test/ck_tile/utility/print/test_print_array.cpp new file mode 100644 index 0000000000..2fe9bc2a0c --- /dev/null +++ b/test/ck_tile/utility/print/test_print_array.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/container/array.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintArrayTest : public PrintTest +{ +}; + +TEST_F(PrintArrayTest, PrintIntArray) +{ + // Test printing array + array arr{10, 20, 30}; + + std::string output = CapturePrintOutput(arr); + + // The expected format should match the array print function implementation + EXPECT_EQ(output, "array{size: 3, data: [10, 20, 30]}"); +} + +TEST_F(PrintArrayTest, PrintSingleElementArray) +{ + // Test printing array + array arr{42}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "array{size: 1, data: [42]}"); +} + +TEST_F(PrintArrayTest, PrintEmptyArray) +{ + // Test printing array (empty array) + array arr{}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "array{size: 0, data: []}"); +} + +TEST_F(PrintArrayTest, PrintFloatArray) +{ + // Test printing array with float values + array arr{3.14f, 2.71f}; + + std::string output = CapturePrintOutput(arr); + + // Note: float printing format may vary, so we'll test for basic structure + EXPECT_TRUE(output.find("array{size: 2, data: [") == 0); + EXPECT_TRUE(output.find("3.14") != std::string::npos); + EXPECT_TRUE(output.find("2.71") != std::string::npos); + EXPECT_TRUE(output.find("]}") == output.length() - 2); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_basic_types.cpp b/test/ck_tile/utility/print/test_print_basic_types.cpp new file mode 100644 index 0000000000..7a26b6371a --- /dev/null +++ 
b/test/ck_tile/utility/print/test_print_basic_types.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintBasicTypesTest : public PrintTest +{ +}; + +TEST_F(PrintBasicTypesTest, PrintIntArray) +{ + int arr[4] = {1, 2, 3, 4}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[1, 2, 3, 4]"); +} + +TEST_F(PrintBasicTypesTest, PrintFloatArray) +{ + float arr[3] = {1.5f, 2.5f, 3.5f}; + + std::string output = CapturePrintOutput(arr); + + // Note: floating point formatting may vary, so we check for key elements + EXPECT_TRUE(output.find("[") == 0); + EXPECT_TRUE(output.find("1.5") != std::string::npos); + EXPECT_TRUE(output.find("2.5") != std::string::npos); + EXPECT_TRUE(output.find("3.5") != std::string::npos); + EXPECT_TRUE(output.back() == ']'); + EXPECT_TRUE(output.find(", ") != std::string::npos); +} + +TEST_F(PrintBasicTypesTest, PrintDoubleArray) +{ + double arr[2] = {10.123, 20.456}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_TRUE(output.find("[") == 0); + EXPECT_TRUE(output.find("10.123") != std::string::npos); + EXPECT_TRUE(output.find("20.456") != std::string::npos); + EXPECT_TRUE(output.back() == ']'); +} + +TEST_F(PrintBasicTypesTest, PrintUnsignedIntArray) +{ + unsigned int arr[3] = {100u, 200u, 300u}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[100, 200, 300]"); +} + +TEST_F(PrintBasicTypesTest, PrintCharArray) +{ + char arr[5] = {'a', 'b', 'c', 'd', 'e'}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[a, b, c, d, e]"); +} + +TEST_F(PrintBasicTypesTest, PrintSingleElementArray) +{ + int arr[1] = {42}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[42]"); +} + +} // namespace ck_tile diff --git 
a/test/ck_tile/utility/print/test_print_buffer_view.cpp b/test/ck_tile/utility/print/test_print_buffer_view.cpp new file mode 100644 index 0000000000..66668a2103 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_buffer_view.cpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/tensor/buffer_view.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintBufferViewTest : public PrintTest +{ +}; + +TEST_F(PrintBufferViewTest, PrintGenericBufferView) +{ + // Test printing generic address space buffer_view + float data[4] = {100.f, 200.f, 300.f, 400.f}; + auto bv = make_buffer_view(&data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: generic") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintGlobalBufferView) +{ + // Test printing global address space buffer_view + float data[4] = {100.f, 200.f, 300.f, 400.f}; + auto bv = make_buffer_view(&data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: global") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintLdsBufferView) +{ + // Test printing LDS address space buffer_view + float data[4] = {100.f, 200.f, 
300.f, 400.f}; + auto bv = make_buffer_view(data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: lds") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintVgprBufferView) +{ + // Test printing VGPR address space buffer_view + float data[4] = {1.5f, 2.5f, 3.5f, 4.5f}; + auto bv = make_buffer_view(data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: vgpr") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_common.hpp b/test/ck_tile/utility/print/test_print_common.hpp new file mode 100644 index 0000000000..3ba2270802 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_common.hpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core/utility/print.hpp" + +class PrintTest : public ::testing::Test +{ + protected: + void SetUp() override {} + void TearDown() override {} + // Helper function to capture and return the output of a print function + template + std::string CapturePrintOutput(const T& type) + { + using namespace ck_tile; + testing::internal::CaptureStdout(); + print(type); + return testing::internal::GetCapturedStdout(); + } +}; diff --git a/test/ck_tile/utility/print/test_print_coordinate_transform.cpp b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp new file mode 100644 index 0000000000..639b113eb7 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/algorithm/coordinate_transform.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintCoordinateTransformTest : public PrintTest +{ +}; + +TEST_F(PrintCoordinateTransformTest, PrintPassThrough) +{ + // Test printing pass_through transform + auto pt = make_pass_through_transform(number<32>{}); + + std::string output = CapturePrintOutput(pt); + + // Verify it contains the pass_through identifier and some structure + EXPECT_TRUE(output.find("pass_through{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintEmbed) +{ + // Test printing embed transform + auto embed_transform = make_embed_transform(make_tuple(number<4>{}, number<8>{}), + make_tuple(number<1>{}, number<4>{})); + + std::string output = CapturePrintOutput(embed_transform); + + // Verify it contains the embed identifier and key fields + EXPECT_TRUE(output.find("embed{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + 
EXPECT_TRUE(output.find("coefficients_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintMerge) +{ + // Test printing merge transform + auto merge_transform = make_merge_transform(make_tuple(number<4>{}, number<8>{})); + + std::string output = CapturePrintOutput(merge_transform); + + // Verify it contains merge identifier and key fields + EXPECT_TRUE(output.find("merge") == + 0); // Could be merge_v2_magic_division or merge_v3_division_mod + EXPECT_TRUE(output.find("low_lengths_") != std::string::npos || + output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintUnmerge) +{ + // Test printing unmerge transform + auto unmerge_transform = make_unmerge_transform(make_tuple(number<4>{}, number<8>{})); + + std::string output = CapturePrintOutput(unmerge_transform); + + // Verify it contains the unmerge identifier and key fields + EXPECT_TRUE(output.find("unmerge{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintFreeze) +{ + // Test printing freeze transform + auto freeze_transform = make_freeze_transform(number<5>{}); + + std::string output = CapturePrintOutput(freeze_transform); + + // Verify it contains the freeze identifier and key fields + EXPECT_TRUE(output.find("freeze{") == 0); + EXPECT_TRUE(output.find("low_idx_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_sequence.cpp b/test/ck_tile/utility/print/test_print_sequence.cpp new file mode 100644 index 0000000000..e73a9f7e33 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_sequence.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "test_print_common.hpp" +#include "ck_tile/core/utility/print.hpp" +#include "ck_tile/core/container/sequence.hpp" + +namespace ck_tile { + +class PrintSequenceTest : public PrintTest +{ +}; + +TEST_F(PrintSequenceTest, PrintSimpleSequence) +{ + // Test printing sequence<1, 5, 8> + constexpr auto seq = sequence<1, 5, 8>{}; + + std::string output = CapturePrintOutput(seq); + + // Verify the output format + EXPECT_EQ(output, "sequence<1, 5, 8>"); +} + +TEST_F(PrintSequenceTest, PrintSingleElementSequence) +{ + // Test printing sequence<42> + constexpr auto seq = sequence<42>{}; + + std::string output = CapturePrintOutput(seq); + + EXPECT_EQ(output, "sequence<42>"); +} + +TEST_F(PrintSequenceTest, PrintEmptySequence) +{ + // Test printing sequence<> (empty sequence) + constexpr auto seq = sequence<>{}; + + std::string output = CapturePrintOutput(seq); + + EXPECT_EQ(output, "sequence<>"); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp new file mode 100644 index 0000000000..d1cb408b5c --- /dev/null +++ b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "test_print_common.hpp" +#include "ck_tile/core/algorithm/static_encoding_pattern.hpp" +#include "ck_tile/core/utility/print.hpp" + +#include + +namespace ck_tile { + +class PrintStaticEncodingPatternTest : public PrintTest +{ + protected: + void TestY0Y1Y2(const std::string& output, auto Y0, auto Y1, auto Y2) + { + std::stringstream expected; + expected << ": <" << Y0 << ", " << Y1 << ", " << Y2 << ">"; + EXPECT_TRUE(output.find(expected.str()) != std::string::npos); + } + void TestX0X1(const std::string& output, auto X0, auto X1) + { + std::stringstream expected; + expected << ": <" << X0 << ", " << X1 << ">"; + EXPECT_TRUE(output.find(expected.str()) != std::string::npos); + } +}; + +TEST_F(PrintStaticEncodingPatternTest, PrintThreadRakedPattern) +{ + // Test printing thread raked pattern + using PatternType = + TileDistributionEncodingPattern2D<64, 8, 16, 4, tile_distribution_pattern::thread_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:64") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:8") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:16") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:4") != std::string::npos); + EXPECT_TRUE(output.find("thread_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +TEST_F(PrintStaticEncodingPatternTest, PrintWarpRakedPattern) +{ + // Test printing warp raked pattern + using PatternType = + TileDistributionEncodingPattern2D<128, 16, 32, 8, tile_distribution_pattern::warp_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + 
EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:128") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:16") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:32") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:8") != std::string::npos); + EXPECT_TRUE(output.find("warp_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +TEST_F(PrintStaticEncodingPatternTest, PrintBlockRakedPattern) +{ + // Test printing block raked pattern + using PatternType = + TileDistributionEncodingPattern2D<256, 32, 64, 16, tile_distribution_pattern::block_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:256") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:32") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:64") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:16") != std::string::npos); + EXPECT_TRUE(output.find("block_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_tuple.cpp b/test/ck_tile/utility/print/test_print_tuple.cpp new file mode 100644 index 0000000000..79aaf1b3af --- /dev/null +++ b/test/ck_tile/utility/print/test_print_tuple.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "test_print_common.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintTupleTest : public PrintTest +{ +}; + +TEST_F(PrintTupleTest, PrintSimpleTuple) +{ + // Test printing tuple with numbers + auto tup = make_tuple(number<1>{}, number<5>{}, number<8>{}); + + std::string output = CapturePrintOutput(tup); + + // Verify the output format matches tuple print implementation + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("1") != std::string::npos); + EXPECT_TRUE(output.find("5") != std::string::npos); + EXPECT_TRUE(output.find("8") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +TEST_F(PrintTupleTest, PrintSingleElementTuple) +{ + // Test printing tuple with single element + auto tup = make_tuple(number<42>{}); + + std::string output = CapturePrintOutput(tup); + + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("42") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +TEST_F(PrintTupleTest, PrintEmptyTuple) +{ + // Test printing empty tuple + auto tup = make_tuple(); + + std::string output = CapturePrintOutput(tup); + + EXPECT_EQ(output, "tuple<>"); +} + +TEST_F(PrintTupleTest, PrintMixedTypeTuple) +{ + // Test printing tuple with mixed types (numbers and constants) + auto tup = make_tuple(number<10>{}, constant<20>{}, number<30>{}); + + std::string output = CapturePrintOutput(tup); + + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("10") != std::string::npos); + EXPECT_TRUE(output.find("20") != std::string::npos); + EXPECT_TRUE(output.find("30") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +} // namespace ck_tile From b0a97498b0965d1b33cf90d117f9783989ef9ccb Mon Sep 17 00:00:00 2001 From: Yi DING Date: Thu, 7 Aug 2025 21:24:43 +0800 Subject: [PATCH 18/21] [CK_TILE] FMHA BWD Remove Unnecessary Padding (#2550) * 
Remove unnecessary pssk * Add BlockFmhaBwdDQDKDVPipeline wrapper * Resolve copilot comments & Remove kpad & fix * Remove spad --- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 171 +++++++----------- example/ck_tile/01_fmha/codegen/utils.py | 21 +++ example/ck_tile/01_fmha/fmha_bwd.hpp | 28 ++- .../ck_tile/core/tensor/null_tile_window.hpp | 7 +- include/ck_tile/ops/fmha.hpp | 2 +- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 54 ++---- ...k_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp | 6 +- ...a_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp | 7 +- ...ck_fmha_bwd_dq_dk_dv_pipeline_selector.hpp | 30 +++ .../pipeline/block_fmha_bwd_pipeline_enum.hpp | 15 -- .../block_fmha_bwd_pipeline_problem.hpp | 6 +- 11 files changed, 158 insertions(+), 189 deletions(-) create mode 100644 example/ck_tile/01_fmha/codegen/utils.py create mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp delete mode 100644 include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 77b63a0c83..47cf6b3ad4 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
# generate kernel instances to speed up compilation import copy @@ -8,21 +8,13 @@ import fnmatch import itertools from pathlib import Path from typing import List, Optional, Tuple, Dict, Literal +from collections import defaultdict from codegen.cmake_config import * from codegen.cpp_symbol_map import * +from codegen.utils import update_file -BWD_DQDKDV_PIPELINE_MAP = { - "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP", - "kr_ktr_vr" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR", -} - -BWD_DQDKDV_PIPELINE_ENUM_MAP = { - "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP", - "kr_ktr_vr" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR", -} - FMHA_BWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n // auto generated by generate.py @@ -56,8 +48,8 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape; -using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, - {F_skpad}, +using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits; -using fmha_bwd_pipeline_{F_idx} = {F_pipeline}; +using fmha_bwd_pipeline_{F_idx} = ck_tile::BlockFmhaBwdDQDKDVPipeline; using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem::AccDataType, typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType, - {F_skpad}, + false, {F_dpad}>>; using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem::AccDataType, typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType, - {F_skpad}, + false, {F_dvpad}>>; using fmha_bwd_dq_dk_dv_kernel_{F_idx} = @@ -115,13 +107,10 @@ using fmha_bwd_dq_dk_dv_kernel_{F_idx} = using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, - {F_pipeline_enum}, fmha_mask_{F_idx}, fmha_dropout_{F_idx}, {F_bias}, {F_dbias}, - {F_spad}, - {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>; @@ -195,15 +184,18 @@ FMHA_BWD_API_PER_HDIM_CASE=""" {F_if} 
(t.hdim_q <= {F_hdim} && t.hdim_v < """ FMHA_BWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) && - ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{ - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dvpad}>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_pipeline_enum}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_spad0}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dpad}, {F_deterministic}>; + ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{ + using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>; + using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}>; + using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>; r = fmha_bwd_(s, a); return r; }} """ +# M0 size for 1d kernels (dot/convert) +M0_1D = 64 + # GEMM0: Q@K=S^T # GEMM1: P^T@dO^T=dV(This was chosen as G1 to match fwd, but N1 must be equal to headdim_v) # GEMM2: dO@V=dP^T(This was chosen as G2 because of the calculation order) @@ -249,8 +241,6 @@ class FmhaBwdDQDKDVKernel: F_hdim : int # hdim F_dtype : str # data type F_tile : FmhaBwdDQDKDVTileSize - F_spad : str # true/false - F_skpad : str # F_dpad : str # F_dvpad : str # F_bias : str # @@ -259,7 +249,6 @@ class FmhaBwdDQDKDVKernel: F_mask : str # value from MASK_MAP F_mode : str # value from MODE_MAP F_deterministic : str # - F_pipeline : str # mask_impl : str # @property @@ -293,8 +282,6 @@ class 
FmhaBwdDQDKDVKernel: F_wm1 = self.F_tile.F_wm1, F_wn1 = self.F_tile.F_wn1, F_wk1 = self.F_tile.F_wk1, - F_spad = BOOL_MAP[self.F_spad], - F_skpad = BOOL_MAP[self.F_skpad], F_dpad = BOOL_MAP[self.F_dpad], F_dvpad = BOOL_MAP[self.F_dvpad], F_bias = BIAS_MAP[self.F_bias], @@ -304,21 +291,18 @@ class FmhaBwdDQDKDVKernel: F_mask = get_mask_map(self.mask_impl)[self.F_mask], F_mode = MODE_MAP[self.F_mode], F_deterministic = BOOL_MAP[self.F_deterministic], - F_pipeline_enum = BWD_DQDKDV_PIPELINE_ENUM_MAP[self.F_pipeline], - F_pipeline = BWD_DQDKDV_PIPELINE_MAP[self.F_pipeline]) + ) @property def name(self) -> str: def pad_name() -> str: n = '' - if self.F_spad == 't': n += 's' - if self.F_skpad == 't' : n += 'sk' if self.F_dpad == 't' : n += 'd' if self.F_dvpad == 't' : n += 'dv' if n != '' : n = 'p' + n return n pn = pad_name() - n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}' + n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name if pn != '' : n += f'_{pn}' else: n += '_npad' @@ -347,20 +331,15 @@ class FmhaBwdDQDKDVKernel: return self.name + ".cpp" # TODO: design a more practical way to do it -# this is current supported tile size & pipeline. +# this is current supported tile size. 
def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : [FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - '64' : [FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - '128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - # '160' : [FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), - # "kr_ktr_vr_iglp", "kr_ktr_vr"], - '256' : [FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"] + '32' : FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), + '64' : FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + '128' : FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + # '160' : FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), + '256' : FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), } else: return None @@ -375,7 +354,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot typename FmhaBwdTypeConfig::ODataType, typename FmhaBwdTypeConfig::OGradDataType, typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, + /* BlockSize = M0 = */ 64, {F_hdim}, {F_mode}, fmha_bwd_dot_do_o_trait_{F_idx}>; @@ -580,7 +559,6 @@ class FmhaBwdConvertQGradKernel: 
@dataclass(frozen=True) class FmhaBwdApiTrait: idx : int # this is not a tunable, but a counter to differentiate symbol - pipeline : str # sync with fmha_bwd_traits<>, to generate fallback calls hdim : int dtype : str # data type @@ -590,9 +568,7 @@ class FmhaBwdApiTrait: bias : str dbias : str dropout : str - spad : str - spad1 : str # spad for dot/convert kernel - skpad : str + spad1d : str # spad for 1d kernels (dot/convert) dpad : str dvpad : str deterministic : str @@ -611,24 +587,14 @@ class FmhaBwdApiTrait: def bhdv(self) -> int: return self.tile.F_bhdv - def scheck(self, spad1 : str) -> str: - if self.mode == 'group': - return 'true' # always support - elif self.spad == 't' and spad1 == 't': - return f'a.seqlen_q % {self.bm0} != 0' - elif self.spad == 'f' and spad1 == 't': - return f'a.seqlen_q % {self.bm0} == 0 and a.seqlen_q % 64 != 0' - else: # self.skpad == 'f' and skpad1 == 'f' - return 'a.seqlen_q % 64 == 0' - @property - def skcheck(self) -> str: + def scheck(self) -> str: if self.mode == 'group': return 'true' # always support - elif self.skpad == 't': - return f'a.seqlen_k % {self.bn0} != 0' - else: - return f'a.seqlen_k % {self.bn0} == 0' + elif self.spad1d == 't': + return f'a.seqlen_q % {M0_1D} != 0' + else: # self.spad1d == 'f' + return f'a.seqlen_q % {M0_1D} == 0' @property def dcheck(self) -> str: @@ -647,14 +613,14 @@ class FmhaBwdApiTrait: def get_occupancy(dtype, hdim): return 2 - return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1, + return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d, F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim)) @property def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel: return FmhaBwdDQDKDVKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_tile=self.tile, - F_spad=self.spad, F_skpad=self.skpad, F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, - 
F_dbias=self.dbias, F_dropout=self.dropout, F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, F_pipeline=self.pipeline, mask_impl=self.mask_impl) + F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, F_dbias=self.dbias, F_dropout=self.dropout, + F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, mask_impl=self.mask_impl) @property def convert_dq_kernel(self) -> FmhaBwdConvertQGradKernel: @@ -664,48 +630,46 @@ class FmhaBwdApiTrait: return 2 return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, - F_bm0=64, F_bn0=self.tile.F_bn0, F_spad=self.spad, F_dpad=self.dpad, + F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim), F_deterministic=self.deterministic) class FmhaBwdApiPool: def __init__(self, mask_impl): - self.dq_dk_dv_pool = dict() + self.dq_dk_dv_pool = defaultdict(lambda: defaultdict(list)) self.mask_impl = mask_impl def register_dq_dk_dv_traits(self, trait : FmhaBwdApiTrait) -> None: # TODO: do we need to check duplication? 
- if trait.dtype not in self.dq_dk_dv_pool.keys(): - self.dq_dk_dv_pool[trait.dtype] = dict() - if trait.hdim not in self.dq_dk_dv_pool[trait.dtype].keys(): - self.dq_dk_dv_pool[trait.dtype][trait.hdim] = list() - self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait)) + @staticmethod + def if_(i: int) -> str: + return 'if' if i == 0 else 'else if' + + def _api_innders(self, traits: List[FmhaBwdApiTrait]) -> str: + inners = "" + i = 0 + for trait in traits: + inners += FMHA_BWD_API_INNER_DISPATCH.format(F_if=self.if_(i), F_mode=MODE_MAP[trait.mode], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], + F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], + F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype], + F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], + F_deterministic=BOOL_MAP[trait.deterministic]) + i += 1 + return inners + @property def api(self) -> str: per_dtypes=str() - for i, dtype in enumerate(self.dq_dk_dv_pool.keys()): + for i, dtype in enumerate(self.dq_dk_dv_pool): per_hdim_case=str() - for j, hdim in enumerate(self.dq_dk_dv_pool[dtype].keys()): + for j, hdim in enumerate(self.dq_dk_dv_pool[dtype]): traits=self.dq_dk_dv_pool[dtype][hdim] - inners=str() - for k, trait in enumerate(traits): - if_k = 'if' if k == 0 else 'else if' - for spad1 in ["t", "f"]: - if (spad1 == "f" and (trait.spad == "t" or trait.mode == "group")): - continue - inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline], - F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], 
F_bias_check=BIAS_CHECK_MAP[trait.bias], - F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], - F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype], - F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_deterministic=BOOL_MAP[trait.deterministic]) - - if_j = 'if' if j == 0 else 'else if' - per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) - if_i = 'if' if i == 0 else 'else if' - per_dtypes = per_dtypes + FMHA_BWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case) + inners = self._api_innders(traits) + per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=self.if_(j), F_hdim=hdim, F_inner_dispatch=inners) + per_dtypes += FMHA_BWD_API_PER_DTYPE.format(F_if=self.if_(i), F_dtype=dtype, F_hdim_case=per_hdim_case) if not per_dtypes: # empty string we add some ignore to suppress warning in api per_dtypes += ' (void)t ; (void)s ; (void)a;' @@ -730,21 +694,16 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d is None: continue - for hdim_str, mode, mask, bias, dbias, dropout, spad, spad1, skpad, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 6)): - tile = d[hdim_str][0] - ppl = d[hdim_str][1] + for hdim_str, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)): + tile = d[hdim_str] hdim = int(hdim_str) 
- if (mode == "group") and (spad == "f" or skpad == "f"): - continue - if (spad1 == "f") and (spad == "t" or mode == "group"): + if (mode == "group") and (spad1d == "f"): continue if ((bias == "no" or bias == "alibi") and dbias == "t"): continue if ("wg32" in dropout): continue - if (dpad == "t" or dvpad == "t"): - ppl = d[hdim_str][2] - t = FmhaBwdApiTrait(idx=0, pipeline=ppl, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad=spad, spad1=spad1, skpad=skpad, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl) + t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl) if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o): continue @@ -808,13 +767,13 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: api_pool, kernels_dot_do_o, kernels_dq_dk_dv, kernels_convert_dq = get_bwd_blobs(filter_list, receipt, mask_impl, optdim_list) - (output_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api) + update_file(output_dir / FMHA_BWD_API_FILENAME, api_pool.api) for k in kernels_dot_do_o: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) for k in kernels_convert_dq: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) for k in kernels_dq_dk_dv: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) def list_blobs(file_path: Path, filter_list: str, receipt, optdim_list, mask_impl) -> None: diff --git a/example/ck_tile/01_fmha/codegen/utils.py b/example/ck_tile/01_fmha/codegen/utils.py new file mode 100644 index 0000000000..e3bbb18c42 --- /dev/null +++ 
b/example/ck_tile/01_fmha/codegen/utils.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# generate kernel instances to speed up compilation + +import os.path as path + + +def update_file(file_path, content): + """Update the file at file_path with the given content if it differs from the existing content. + + It avoids unnecessary touching of the file which triggers rebuilds + """ + + existing_content = "" + if path.exists(file_path): + with open(file_path, "r") as file: + existing_content = file.read() + if existing_content == content: + return + with open(file_path, "w") as file: + file.write(content) diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 9179dbd9be..c999cf750e 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -357,31 +357,25 @@ auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args) template struct fmha_bwd_dq_dk_dv_traits_ { - static constexpr ck_tile::index_t HDim = HDim_; - using DataType = ck_tile::remove_cvref_t; - static constexpr bool kIsGroupMode = kIsGroupMode_; - static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_; - using FmhaMask = ck_tile::remove_cvref_t; - using FmhaDropout = ck_tile::remove_cvref_t; - static constexpr auto BiasEnum = BiasEnum_; - static constexpr bool kHasBiasGrad = kHasBiasGrad_; - static constexpr bool kPadS = kPadS_; - static constexpr bool kPadSK = kPadSK_; - static constexpr bool kPadD = kPadD_; - static constexpr bool kPadDv = kPadDv_; - static constexpr bool kIsDeterministic = kIsDeterministic_; + static constexpr ck_tile::index_t HDim = HDim_; + using DataType = ck_tile::remove_cvref_t; + static constexpr bool kIsGroupMode = kIsGroupMode_; + using FmhaMask = ck_tile::remove_cvref_t; + using FmhaDropout = ck_tile::remove_cvref_t; + static constexpr auto BiasEnum = BiasEnum_; + static constexpr bool kHasBiasGrad = kHasBiasGrad_; + static constexpr bool kPadD = kPadD_; + static constexpr bool kPadDv = kPadDv_; + static constexpr bool kIsDeterministic = kIsDeterministic_; }; template diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp index de99be1965..f7eca73afb 100644 --- a/include/ck_tile/core/tensor/null_tile_window.hpp +++ b/include/ck_tile/core/tensor/null_tile_window.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -53,10 +53,13 @@ struct is_null_tile_window> : public std::true_type }; } // namespace impl +template +constexpr bool is_null_tile_window_v = impl::is_null_tile_window>::value; + template CK_TILE_DEVICE constexpr auto is_null_tile_window(const T&) { - return impl::is_null_tile_window>::value; + return is_null_tile_window_v>; } template diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index 30bea193b7..313de5f29a 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -24,8 +24,8 @@ #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp" -#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp" diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index ce3bf8fe8d..8b184b18f3 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -52,8 +52,6 @@ struct FmhaBwdDQDKDVKernel using BiasGradDataType = ck_tile::remove_cvref_t; static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode; - static constexpr bool kPadSeqLenQ = FmhaPipeline::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK; static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; static constexpr auto BiasEnum = 
FmhaPipeline::BiasEnum; @@ -85,8 +83,6 @@ struct FmhaBwdDQDKDVKernel #define _TS_ std::to_string auto pn = [&] () { std::string n; - if (kPadSeqLenQ) n += "s"; - if (kPadSeqLenK) n += "sk"; if (kPadHeadDimQ) n += "d"; if (kPadHeadDimV) n += "dv"; return n.empty() ? n : std::string("p") + n; }(); @@ -100,7 +96,7 @@ struct FmhaBwdDQDKDVKernel "r" + _TS_(gbr4::at(ck_tile::number<0>{})) + "x" + _TS_(gbr4::at(ck_tile::number<1>{})) + "x" + _TS_(gbr4::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt0::at(ck_tile::number<0>{})) + "x" + _TS_(gwt0::at(ck_tile::number<1>{})) + "x" + _TS_(gwt0::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt1::at(ck_tile::number<0>{})) + "x" + _TS_(gwt1::at(ck_tile::number<1>{})) + "x" + _TS_(gwt1::at(ck_tile::number<2>{})) + "_" + - ("o" + _TS_(kBlockPerCu) + "_") + _SS_(FmhaPipeline::name) + (pn.empty() ? "_npad" : "_" + pn) + + ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? "_dropout" : "_ndropout" ) + (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? 
"_deterministic" : "_ndeterministic" ); @@ -1221,7 +1217,7 @@ struct FmhaBwdDQDKDVKernel const auto q_dram = pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); const auto k_dram_naive = make_naive_tensor_view( k_ptr, @@ -1232,7 +1228,7 @@ struct FmhaBwdDQDKDVKernel const auto k_dram = pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); const auto v_dram = [&]() { const auto v_dram_naive = make_naive_tensor_view( @@ -1244,22 +1240,15 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); - const auto lse_dram = [&]() { - const auto lse_dram_naive = make_naive_tensor_view_packed( - lse_ptr, make_tuple(kargs.seqlen_q), number<1>{}); - return pad_tensor_view( - lse_dram_naive, make_tuple(number{}), sequence{}); - }(); + // lse and d should be fine to read unpaded data as they are not on the reduction dimension + const auto lse_dram = make_naive_tensor_view_packed( + lse_ptr, make_tuple(kargs.seqlen_q), number{}); - const auto d_dram = [&]() { - const auto d_dram_naive = make_naive_tensor_view_packed( - d_ptr, make_tuple(kargs.seqlen_q), number<1>{}); - return pad_tensor_view( - d_dram_naive, make_tuple(number{}), sequence{}); - }(); + const auto d_dram = make_naive_tensor_view_packed( + d_ptr, make_tuple(kargs.seqlen_q), number{}); const auto do_dram_naive = make_naive_tensor_view( do_ptr, @@ -1270,7 +1259,7 @@ struct FmhaBwdDQDKDVKernel const auto do_dram = pad_tensor_view( do_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); auto q_dram_window = make_tile_window( q_dram, @@ -1313,7 +1302,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dq_acc_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); return make_tile_window( @@ -1341,7 +1330,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dq_acc_dram_naive, make_tuple(number{}, number{}), - 
sequence{}); + sequence{}); }(); return make_tile_window( @@ -1376,9 +1365,8 @@ struct FmhaBwdDQDKDVKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {0, i_n0}); @@ -1406,9 +1394,8 @@ struct FmhaBwdDQDKDVKernel number{}, number<1>{}); - return pad_tensor_view(dbias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + dbias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(dbias_dram, bias_dram_window_lengths, {0, i_n0}); @@ -1495,9 +1482,8 @@ struct FmhaBwdDQDKDVKernel number<1>{}, number<1>{}); - return pad_tensor_view(randval_dram_naive, - randval_dram_window_lengths, - sequence{}); + return pad_tensor_view( + randval_dram_naive, randval_dram_window_lengths, sequence{}); }(); return make_tile_window(randval_dram, randval_dram_window_lengths, {0, i_n0}); @@ -1550,7 +1536,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dk_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto dv_dram = [&]() { @@ -1564,7 +1550,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dv_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto dk_dram_window = make_tile_window( diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp index 8a13c0b060..1f11569533 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp @@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR static constexpr index_t kVHeaddim = BlockFmhaShape::kVHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static 
constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; @@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad(); static constexpr index_t kAlignmentVGrad = kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad(); - static constexpr index_t kAlignmentBias = - kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias(); + static constexpr index_t kAlignmentBias = 1; static constexpr const char* name = "kr_ktr_vr"; @@ -554,7 +551,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp index c88b058d32..967fe2362d 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp @@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP static constexpr index_t kVHeaddim = BlockFmhaShape::kVHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; @@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad(); static constexpr index_t kAlignmentVGrad = kPadHeadDimV ? 
1 : Policy::template GetAlignmentVGrad(); - static constexpr index_t kAlignmentBias = - kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias(); + static constexpr index_t kAlignmentBias = 1; static constexpr const char* name = "kr_ktr_vr_iglp"; @@ -590,7 +587,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); @@ -849,7 +845,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp new file mode 100644 index 0000000000..80c311de86 --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp" + +namespace ck_tile { + +template +class BlockFmhaBwdDQDKDVPipelineSelector +{ + static constexpr bool has_dpad = Problem::Traits::kPadHeadDimQ || Problem::Traits::kPadHeadDimV; + + public: + using type = std::conditional_t, + BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP>; +}; + +template +class BlockFmhaBwdDQDKDVPipeline : public BlockFmhaBwdDQDKDVPipelineSelector::type +{ + public: + static constexpr const char* name = "auto"; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp deleted file mode 100644 index 27f58ef2f8..0000000000 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -namespace ck_tile { - -// This class is used for codegen pattern matching -enum class BlockFmhaBwdPipelineEnum -{ - KRKTRVR_IGLP = 0, - KRKTRVR, -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp index c4c4a745a7..f6c79c7db6 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -55,13 +55,13 @@ struct BlockFmhaBwdPipelineProblem static constexpr bool kIsDeterministic = kIsDeterministic_; // attributes from traits - static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV; static constexpr auto BiasEnum = Traits::BiasEnum; static constexpr bool kHasBiasGrad = Traits::kHasBiasGrad; static constexpr index_t kBlockPerCu = Traits::kBlockPerCu; + static_assert(!Traits::kPadSeqLenQ, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ"); + static_assert(!Traits::kPadSeqLenK, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ"); }; template Date: Thu, 7 Aug 2025 21:37:28 +0800 Subject: [PATCH 19/21] Add e8m0 scaled convert into CK_TILE (#2617) * first commit * remove redundent code * modify according to comments. * fix type_convert error with scaled_type_convert --- include/ck_tile/core.hpp | 1 + include/ck_tile/core/numeric/e8m0.hpp | 102 +++++++++++ include/ck_tile/core/numeric/mxfp_convert.hpp | 27 +-- include/ck_tile/core/numeric/pk_fp4.hpp | 163 +++++++++++------- include/ck_tile/core/numeric/type_convert.hpp | 41 +++-- include/ck_tile/host/host_tensor.hpp | 8 +- test/ck_tile/data_type/CMakeLists.txt | 1 + test/ck_tile/data_type/test_mx_scale.cpp | 162 +++++++++++++++++ 8 files changed, 419 insertions(+), 86 deletions(-) create mode 100644 include/ck_tile/core/numeric/e8m0.hpp create mode 100644 test/ck_tile/data_type/test_mx_scale.cpp diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index c8945f03e9..9f3c996873 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -27,6 +27,7 @@ #include "ck_tile/core/container/thread_buffer.hpp" #include "ck_tile/core/container/tuple.hpp" #include "ck_tile/core/numeric/bfloat16.hpp" +#include "ck_tile/core/numeric/e8m0.hpp" #include "ck_tile/core/numeric/float8.hpp" #include 
"ck_tile/core/numeric/half.hpp" #include "ck_tile/core/numeric/int8.hpp" diff --git a/include/ck_tile/core/numeric/e8m0.hpp b/include/ck_tile/core/numeric/e8m0.hpp new file mode 100644 index 0000000000..ea94880f27 --- /dev/null +++ b/include/ck_tile/core/numeric/e8m0.hpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/numeric/mxfp_convert.hpp" + +namespace ck_tile { + +/** + * @brief Unsigned representation of a conventional biased Float32 exponent. + * + * bias = 127; + * + * E8M0_1 = 0b01111111; => 2^(127-127) = 1 + * E8M0_2 = 0b10000000; => 2^(128-127) = 2^1 = 2 + * E8M0_3 = 0b10000010; => 2^(130-127) = 2^3 = 8 + * E8M0_135 = 0b10000111; => 2^(135-127) = 2^8 = 256 + * E8M0_142 = 0b10001110; => 2^(142-127) = 2^15 = 32768 + * E8M0_MIN = 0b00000000; => 2^-127 + * E8M0_MAX = 0b11111110; => 2^127 + * E8M0_NAN = 0b11111111; => NaN + */ + +struct e8m0_bexp_t +{ + using raw_type = uint8_t; + using type = raw_type; + + raw_type data; + + CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t() : data{type{0b11111111}} {} + CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(type init) : data{init} {} + CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(float scale) + : e8m0_bexp_t(static_cast(numeric_utils::get_exponent(scale))) + { + } + CK_TILE_HOST_DEVICE constexpr operator type() const { return data; } + CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; } + CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; } + CK_TILE_HOST_DEVICE constexpr operator float() const; + + constexpr bool operator==(const e8m0_bexp_t& other) const { return data == other.data; } + + constexpr bool operator!=(const e8m0_bexp_t& other) const { return data != other.data; } +}; + +using e8m0_t = e8m0_bexp_t; +using e8m0_raw_t = typename e8m0_t::raw_type; + +template <> +struct numeric_traits +{ + using bitwise_type = 
e8m0_raw_t; + + static constexpr int exp = 8; + static constexpr int mant = 0; + static constexpr int bias = 127; + static constexpr int PackedSize = 1; +}; + +// limits +template +struct numeric; + +template <> +struct numeric +{ + static constexpr e8m0_raw_t binary_min = 0b00000000; // 2^-127 + static constexpr e8m0_raw_t binary_max = 0b11111110; // 2^127 + static constexpr e8m0_raw_t binary_nan = 0b11111111; + CK_TILE_HOST_DEVICE static constexpr e8m0_t min() { return e8m0_t{binary_min}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t max() { return e8m0_t{binary_max}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t quiet_NaN() { return e8m0_t{binary_nan}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t signaling_NaN() { return e8m0_t{binary_nan}; } + CK_TILE_HOST_DEVICE static constexpr bool has_inf() { return false; } + + CK_TILE_HOST_DEVICE static constexpr e8m0_t epsilon() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t round_error() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t zero() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t infinity() { return signaling_NaN(); } +}; + +CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t::operator float() const +{ + using traits = numeric_traits; + if(data == numeric::binary_nan) + { + return traits::NaN; + } + else if(data == 0) + { + return std::numeric_limits::min(); + } + else + { + return bit_cast(static_cast(data) << traits::mant); + } +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/numeric/mxfp_convert.hpp b/include/ck_tile/core/numeric/mxfp_convert.hpp index b2e138e880..9b378933d0 100644 --- a/include/ck_tile/core/numeric/mxfp_convert.hpp +++ b/include/ck_tile/core/numeric/mxfp_convert.hpp @@ -12,15 +12,19 @@ struct numeric_utils : numeric_traits using traits = numeric_traits; using _numeric = numeric; - using raw_type = typename T::raw_type; + using raw_type = typename traits::bitwise_type; static constexpr int exp_mask = (1 
<< traits::exp) - 1; - static constexpr int get_exponent(raw_type x) + static constexpr raw_type get_exponent(raw_type x) { // TODO: check if repeated calls are optimized. return (x >> traits::mant) & exp_mask; } + static constexpr raw_type get_exponent(const T& x) + { + return get_exponent(bit_cast(x)); + } static constexpr bool is_positive(raw_type x) { return (x >> (traits::exp + traits::mant)) == _numeric::binary_zero; @@ -33,7 +37,7 @@ struct numeric_utils : numeric_traits static constexpr double get_mantissa(raw_type x) { double mantissa = is_subnormal(x) ? 0.0f : 1.0f; - for(uint32_t i = 0; i < traits::mant; ++i) + for(raw_type i = 0; i < traits::mant; ++i) { mantissa += std::ldexp(static_cast(x & 0b1), -(traits::mant - i)); x >>= 1; @@ -43,22 +47,23 @@ struct numeric_utils : numeric_traits }; template -CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, int scale_exp = 127) +CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, float scale = 1.f) { - using utils = numeric_utils; - static constexpr int e8m0_bias = 127; // TODO: make it generic. - float sign = utils::is_positive(data) ? 1.0 : -1.0; - int exp = (utils::is_subnormal(data) ? 1 : utils::get_exponent(data)) - utils::bias; - float mant = utils::get_mantissa(data); + using utils = numeric_utils; + float sign = utils::is_positive(data) ? 1.0 : -1.0; + int exp = (utils::is_subnormal(data) ? 
1 : utils::get_exponent(data)) - utils::bias; + float mant = utils::get_mantissa(data); - return std::ldexp(sign * mant, exp + scale_exp - e8m0_bias); + return std::ldexp(sign * mant * scale, exp); } template -CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value) +CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value, float scale = 1.f) { using bitwise_type = typename numeric_traits::bitwise_type; + value /= scale; + if(std::abs(value) > float(numeric::max())) { float max_value = numeric::max(); diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp index 0dee750b69..a345cd1b75 100644 --- a/include/ck_tile/core/numeric/pk_fp4.hpp +++ b/include/ck_tile/core/numeric/pk_fp4.hpp @@ -23,14 +23,11 @@ using fp32x2_t = float __attribute__((ext_vector_type(2))); using fp16x2_t = _Float16 __attribute__((ext_vector_type(2))); using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2))); -CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float); +CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float x, float scale = 1.f); // TODO: Add stochastic method struct pk_float4_e2m1_t { - static constexpr int exponent = 2; - static constexpr int mantissa = 1; - static constexpr int bias = 1; // TODO: Can we merge raw_type and type? 
using raw_type = uint8_t; using type = raw_type; @@ -41,18 +38,27 @@ struct pk_float4_e2m1_t CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t(T init) : data{static_cast(init)} { } - CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init) : data{float_to_e2m1(init)} + CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init, float scale = 1.f) + : data{float_to_e2m1(init, scale)} { } CK_TILE_HOST_DEVICE constexpr operator type() const { return data; } CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; } CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; } - CK_TILE_HOST_DEVICE constexpr operator float() const; - CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const; - CK_TILE_HOST_DEVICE constexpr operator fp16_t() const; - CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const; - CK_TILE_HOST_DEVICE constexpr operator bf16_t() const; - CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const; + + CK_TILE_HOST_DEVICE constexpr float to_float(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp32x2_t to_fp32x2(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp16_t to_fp16(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp16x2_t to_fp16x2(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr bf16_t to_bf16(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr bf16x2_t to_bf16x2(float scale = 1.f) const; + + CK_TILE_HOST_DEVICE constexpr operator float() const { return to_float(); } + CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const { return to_fp32x2(); } + CK_TILE_HOST_DEVICE constexpr operator fp16_t() const { return to_fp16(); } + CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const { return to_fp16x2(); } + CK_TILE_HOST_DEVICE constexpr operator bf16_t() const { return to_bf16(); } + CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const { return to_bf16x2(); } template CK_TILE_HOST_DEVICE constexpr raw_type unpack(number) const; @@ -191,131 +197,160 @@ CK_TILE_DEVICE 
pk_fp4_raw_t _to_f4(T src, float scale = 1.0f) } // namespace impl #endif -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16_t() const +CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_t::to_bf16(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return bf16_t{type_convert(convert_to_float(unpack(number<0>{})))}; + return bf16_t{type_convert(convert_to_float(unpack(number<0>{}), scale))}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16x2_t() const + +CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_t::to_bf16x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return bf16x2_t{type_convert(convert_to_float(unpack(number<0>{}))), - type_convert(convert_to_float(unpack(number<1>{})))}; + return bf16x2_t{type_convert(convert_to_float(unpack(number<0>{}), scale)), + type_convert(convert_to_float(unpack(number<1>{}), scale))}; #endif } // TODO: make float_to_e2m1 generic so that we can convert from directrly. 
-CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return convert_to_type(x); + return convert_to_type(x, scale); #endif } -CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x) { return fp32x2_t(x); } -CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x) { return fp16x2_t(x); } -CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x) { return bf16x2_t(x); } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x) { return float_to_e2m1(x); } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x, float scale) +{ + return float_to_e2m1(x, scale); +} +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return float_to_e2m1(type_convert(x)); + return float_to_e2m1(type_convert(x), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return float_to_e2m1(type_convert(x)); + return float_to_e2m1(type_convert(x), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0])), - float_to_e2m1(type_convert(x[1]))); + return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0]), scale), + float_to_e2m1(type_convert(x[1]), 
scale)); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0])), - float_to_e2m1(type_convert(x[1]))); + return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0]), scale), + float_to_e2m1(type_convert(x[1]), scale)); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(x[0]), float_to_e2m1(x[1])); + return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale)); #endif } +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x, float scale) +{ + return x.to_fp32x2(scale); +} +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x, float scale) +{ + return x.to_fp16x2(scale); +} +CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x, float scale) +{ + return x.to_bf16x2(scale); +} +CK_TILE_HOST_DEVICE constexpr float pk_fp4_to_float(const pk_fp4_t& x, float scale) +{ + return x.to_float(scale); +} +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_to_fp16(const pk_fp4_t& x, float scale) +{ + return x.to_fp16(scale); +} +CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_to_bf16(const pk_fp4_t& x, float scale) +{ + return x.to_bf16(scale); +} + #if TEST_convert_with_table == 0 -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const +CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return convert_to_float(unpack(number<0>{})); + return 
convert_to_float(unpack(number<0>{}), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp32x2_t{convert_to_float(unpack(number<0>{})), - convert_to_float(unpack(number<1>{}))}; + return fp32x2_t{convert_to_float(unpack(number<0>{}), scale), + convert_to_float(unpack(number<1>{}), scale)}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const + +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp16_t{type_convert(convert_to_float(unpack(number<0>{})))}; + return fp16_t{type_convert(convert_to_float(unpack(number<0>{}), scale))}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp16x2_t{type_convert(convert_to_float(unpack(number<0>{}))), - type_convert(convert_to_float(unpack(number<1>{})))}; + return fp16x2_t{type_convert(convert_to_float(unpack(number<0>{}), scale)), + type_convert(convert_to_float(unpack(number<1>{}), scale))}; #endif } #else -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const +CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const { - return e2m1_to_fp32_table[data & 0xf]; + return e2m1_to_fp32_table[unpack(number<0>{})] * scale; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const { - return fp32x2_t{e2m1_to_fp32_table[data & 0xf], e2m1_to_fp32_table[data >> 4]}; + return fp32x2_t{e2m1_to_fp32_table[unpack(number<0>{})] * scale, 
e2m1_to_fp32_table[unpack(number<1>{})] * scale}; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const { - return e2m1_to_fp16_table[data & 0xf]; + return type_convert(e2m1_to_fp16_table[unpack(number<0>{})]) * scale; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const { - return fp16x2_t{e2m1_to_fp16_table[data & 0xf], e2m1_to_fp16_table[data >> 4]}; + return fp16x2_t{ + type_convert(type_convert(e2m1_to_fp16_table[unpack(number<0>{})]) * scale), + type_convert(type_convert(e2m1_to_fp16_table[unpack(number<1>{})]) * scale)}; } #endif diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp index 94d6e3cd34..1455fce0ea 100644 --- a/include/ck_tile/core/numeric/type_convert.hpp +++ b/include/ck_tile/core/numeric/type_convert.hpp @@ -64,6 +64,7 @@ CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float) CK_TILE_TYPE_CONVERT(float, float, int8_t, int8) CK_TILE_TYPE_CONVERT(int8_t, int8, float, float) +#undef CK_TILE_TYPE_CONVERT } // namespace ck_tile @@ -71,16 +72,36 @@ CK_TILE_TYPE_CONVERT(int8_t, int8, float, float) namespace ck_tile { -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2) -CK_TILE_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2) -CK_TILE_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2) -CK_TILE_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16) -#undef CK_TILE_TYPE_CONVERT +template +CK_TILE_HOST_DEVICE constexpr Y scaled_type_convert(X x, float scale); + +#define CK_TILE_SCALED_TYPE_CONVERT(dtype_, dname_, stype_, sname_) \ + template <> \ + 
CK_TILE_HOST_DEVICE constexpr dtype_ scaled_type_convert(stype_ x, \ + float scale) \ + { \ + return sname_##_to_##dname_(x, scale); \ + } \ + template <> \ + CK_TILE_HOST_DEVICE constexpr dtype_ type_convert(stype_ x) \ + { \ + return sname_##_to_##dname_(x, 1.f); \ + } + +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2) +CK_TILE_SCALED_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2) +CK_TILE_SCALED_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2) +CK_TILE_SCALED_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float) +CK_TILE_SCALED_TYPE_CONVERT(float, float, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16) +CK_TILE_SCALED_TYPE_CONVERT(bf16_t, bf16, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16) +CK_TILE_SCALED_TYPE_CONVERT(fp16_t, fp16, pk_fp4_t, pk_fp4) +#undef CK_TILE_SCALED_TYPE_CONVERT + #endif } // namespace ck_tile diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index c3f1b7d221..b7329fcac7 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -409,7 +409,13 @@ struct HostTensor } // void SetZero() { ck_tile::ranges::fill(mData, 0); } - void SetZero() { std::fill(mData.begin(), mData.end(), 0); } + void SetZero() + { + if constexpr(std::is_same_v) + std::fill(mData.begin(), mData.end(), e8m0_t{1.f}); + else + std::fill(mData.begin(), mData.end(), 0); + } template void ForEach_impl(F&& f, std::vector& idx, size_t rank) diff --git a/test/ck_tile/data_type/CMakeLists.txt b/test/ck_tile/data_type/CMakeLists.txt index a9461dca9c..384fd3c1c4 100644 --- a/test/ck_tile/data_type/CMakeLists.txt +++ b/test/ck_tile/data_type/CMakeLists.txt @@ -3,6 +3,7 @@ if(GPU_TARGETS MATCHES "gfx9") endif() if(GPU_TARGETS MATCHES "gfx95") 
add_gtest_executable(test_ck_tile_pk_fp4 test_pk_fp4.cpp) + add_gtest_executable(test_ck_tile_mx_scale test_mx_scale.cpp) endif() if(CK_USE_OCP_FP8 OR CK_USE_FNUZ_FP8) diff --git a/test/ck_tile/data_type/test_mx_scale.cpp b/test/ck_tile/data_type/test_mx_scale.cpp new file mode 100644 index 0000000000..7a024d238f --- /dev/null +++ b/test/ck_tile/data_type/test_mx_scale.cpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +using ck_tile::bf16_t; +using ck_tile::bf16x2_t; +using ck_tile::fp16_t; +using ck_tile::fp16x2_t; +using ck_tile::fp32_t; +using ck_tile::fp32x2_t; +using ck_tile::number; +using ck_tile::pk_fp4_t; + +template +CK_TILE_HOST void test_convert(); + +using ck_tile::e8m0_raw_t; +using ck_tile::e8m0_t; + +TEST(OCP_Scale, NumericLimits) +{ + EXPECT_EQ(ck_tile::numeric::has_inf(), false); + EXPECT_EQ(ck_tile::numeric::zero(), ck_tile::numeric::signaling_NaN()); + EXPECT_EQ(ck_tile::numeric::min(), e8m0_t{e8m0_raw_t{0b00000000}}); + EXPECT_EQ(ck_tile::numeric::max(), e8m0_t{e8m0_raw_t{0b11111110}}); +} +TEST(OCP_Scale, NumericBasic) +{ + auto scale_1 = e8m0_t{1.0f}; + auto scale_2 = e8m0_t{e8m0_raw_t{ck_tile::numeric_traits::bias}}; // 2^0 + EXPECT_EQ(scale_1, scale_2); + + auto scale_3 = e8m0_t{8.0f}; + auto scale_4 = e8m0_t{e8m0_raw_t{3 + ck_tile::numeric_traits::bias}}; // 2^3 + EXPECT_EQ(scale_3, scale_4); +} + +TEST(OCP_Scale, ScaledConvertDevice) +{ + constexpr bool is_device = true; + test_convert(); // fp32 -> fp4 -> fp32 + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); +} +TEST(OCP_Scale, ScaledConvertHost) +{ + constexpr bool is_device = false; + test_convert(); // fp32 -> fp4 -> fp32 + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); +} +TEST(OCP_Scale, 
tensorInit) +{ + using scale_t = e8m0_t; + ck_tile::HostTensor scales({10, 10}); + ck_tile::FillUniformDistribution{1.f, 1.f}(scales); + scales.SetZero(); +} + +#define toPF4(x, y) ck_tile::scaled_type_convert(x, y) +#define toDST(x, y) ck_tile::scaled_type_convert(x, y) +#define toDSTx2(x, y) ck_tile::scaled_type_convert(x, y) + +#define toF32(x) ck_tile::type_convert(x) +#define toPF4_(x) ck_tile::type_convert(x) +#define toSRC(x) ck_tile::type_convert(x) +#define toDST_(x) ck_tile::type_convert(x) + +template +__global__ void MyKernel(Args... args) +{ + Kernel{}(args...); +} +template +struct SrcPkfp4Dst +{ + CK_TILE_HOST_DEVICE void + operator()(const SRC* src, DST* dst, e8m0_t scale1, e8m0_t scale2) const + { + + using SRCx2_t = ck_tile::ext_vector_t; + using DSTx2_t = ck_tile::ext_vector_t; + + ck_tile::static_for<0, N, 2>{}([&](auto i) { + const auto input2 = SRCx2_t{src[i], src[i + 1]}; + + if(i % 4 == 0) + { + // ex: fp32_t -> fp4 -> bf16_t + dst[i] = toDST(toPF4(src[i], scale1), scale2); + // ex: fp32x2_t -> pk_fp4 -> unpack<0> -> bf16_t + dst[i + 1] = toDST(toPF4_(toPF4(input2, scale1).unpack(number<1>{})), scale2); + } + else + { + // ex: fp32x2_t -> pk_fp4_t -> bf16x2_t + reinterpret_cast(dst)[i >> 1] = toDSTx2(toPF4(input2, scale1), scale2); + } + }); + } +}; + +template +CK_TILE_HOST void test_convert() +{ + const auto test_data = std::array{4.f, 6.f, 8.f, 10.f}; + const auto ref_data = std::array{8.f, 16.f, 16.f, 16.f}; + const auto scale1 = e8m0_t{8.0f}; + const auto scale2 = e8m0_t{16.0f}; + + static_assert(test_data.size() == ref_data.size()); + static_assert(test_data.size() % 2 == 0); + + constexpr int N = test_data.size(); + std::array in; + std::array ref, out; + + // prepare input and ground truth in host + for(int i = 0; i < N; ++i) + { + in[i] = toSRC(test_data[i]); + ref[i] = toDST_(ref_data[i]); + EXPECT_EQ(test_data[i], toF32(in[i])); + EXPECT_EQ(ref_data[i], toF32(ref[i])); + } + + using job = SrcPkfp4Dst; + + if constexpr(is_device) + 
{ + auto in_d = std::make_unique(in.size() * sizeof(SRC)); + auto out_d = std::make_unique(out.size() * sizeof(DST)); + in_d->ToDevice(in.data()); + + MyKernel<<<1, 1>>>(reinterpret_cast(in_d->GetDeviceBuffer()), + reinterpret_cast(out_d->GetDeviceBuffer()), + scale1, + scale2); + + out_d->FromDevice(out.data()); + } + else + { + job{}(in.data(), out.data(), scale1, scale2); + } + + for(int i = 0; i < N; ++i) + EXPECT_EQ(ref[i], out[i]) << "i:" << i; +} From 3c9400471dcd4b3f55d8f6b88b562bda63b75657 Mon Sep 17 00:00:00 2001 From: Sami Remes Date: Fri, 8 Aug 2025 02:03:49 +0300 Subject: [PATCH 20/21] [CK_TILE] Enable persistent kernel and tail handler in tile_engine (#2300) * Enable persistent kernel in tile_engine and use tail handler * Fix formatting * Add persistent to default_config.json * Remove extra newlines and add persistent also to user config * Reduce instances from default_config.json * add persistent to benchmark.json and custom_ci_config.json * changed the config file to have few instances --------- Co-authored-by: Thomas Ning Co-authored-by: ThomasNing --- tile_engine/ops/gemm/codegen_utils.py | 89 ------------------- tile_engine/ops/gemm/configs/benchmark.json | 6 ++ .../ops/gemm/configs/custom_ci_config.json | 6 ++ .../ops/gemm/configs/default_config.json | 7 +- .../gemm/configs/user_provided_config.json | 6 ++ tile_engine/ops/gemm/gemm_host_api.hpp | 16 ++-- tile_engine/ops/gemm/gemm_instance_builder.py | 51 +++++------ tile_engine/ops/gemm/json_config.py | 4 + 8 files changed, 60 insertions(+), 125 deletions(-) diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py index 9ff76724cc..4a990f3309 100644 --- a/tile_engine/ops/gemm/codegen_utils.py +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -65,93 +65,6 @@ CSHUFFLE_EPILOGUE = """ UniversalGemmProblem::TransposeC, memory_operation>>; """ -HOT_LOOP_FALSE = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - 
ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("Num K loop must be larger than number of prefetech stages."); - } -""" -RUN_MEM = """ - // Handle One and Full cases directly - if (tail_num == ck_tile::TailNumber::One) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } else if (tail_num == ck_tile::TailNumber::Full) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - - auto check_tail = [&](auto... TNs) { - ([&]{ - if constexpr(BaseGemmPipeline::PrefetchStages > static_cast(decltype(TNs)::value)) { - if(tail_num == decltype(TNs)::value) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - }(), ...); - }; - - check_tail( - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{} - ); -""" - -RUN_COMPV3 = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("The tail number is wrong. 
It should be Full, Odd, or Even."); - } -""" - -RUN_COMPV4 = """ - if(tail_num == ck_tile::TailNumber::Three) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } -""" - PIPELINE_MAP = { "mem": ["ck_tile::BaseGemmPipelineAgBgCrMem", "ck_tile::GemmPipelineAgBgCrMem"], @@ -172,8 +85,6 @@ SCHEDULER_MAP = { EPILOGUE_MAP = {"default": DEFAULT_EPILOGUE, "cshuffle": CSHUFFLE_EPILOGUE} -HOT_LOOP_TRUE = {"mem": RUN_MEM, "compv3": RUN_COMPV3, "compv4": RUN_COMPV4} - def BOOL_MAP(b_): return {True: "true", False: "false"}[bool(b_)] diff --git a/tile_engine/ops/gemm/configs/benchmark.json b/tile_engine/ops/gemm/configs/benchmark.json index 1560698b77..def3ca4453 100644 --- a/tile_engine/ops/gemm/configs/benchmark.json +++ b/tile_engine/ops/gemm/configs/benchmark.json @@ -96,6 +96,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/custom_ci_config.json b/tile_engine/ops/gemm/configs/custom_ci_config.json index 9187fb01eb..ca6c7230fd 100644 --- a/tile_engine/ops/gemm/configs/custom_ci_config.json +++ b/tile_engine/ops/gemm/configs/custom_ci_config.json @@ -77,6 +77,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/default_config.json b/tile_engine/ops/gemm/configs/default_config.json index 12a8ddd4b7..5bd51b809a 100644 --- a/tile_engine/ops/gemm/configs/default_config.json +++ b/tile_engine/ops/gemm/configs/default_config.json @@ -95,6 +95,11 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false + ] } } -} \ No newline at end of file +} diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json index 5761b39ada..76e194f6b9 100644 --- 
a/tile_engine/ops/gemm/configs/user_provided_config.json +++ b/tile_engine/ops/gemm/configs/user_provided_config.json @@ -82,6 +82,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 2c4af8955f..f28f5dd29c 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -144,7 +144,8 @@ inline auto create_args(int argc, char* argv[]) .insert("pad_k", "false", "Whether pad or not in k direction. Possible values are true or false. Default is " - "false."); + "false.") + .insert("persistent", "false", "Whether to use persistent kernel. Default is false."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -208,12 +209,13 @@ void permute_vectors_i4x4_b(Tensor& tensor) auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser) { KernelTraits trait; - trait.pipeline = arg_parser.get_str("pipeline"); - trait.scheduler = arg_parser.get_str("scheduler"); - trait.epilogue = arg_parser.get_str("epilogue"); - trait.pad_m = arg_parser.get_bool("pad_m"); - trait.pad_n = arg_parser.get_bool("pad_n"); - trait.pad_k = arg_parser.get_bool("pad_k"); + trait.pipeline = arg_parser.get_str("pipeline"); + trait.scheduler = arg_parser.get_str("scheduler"); + trait.epilogue = arg_parser.get_str("epilogue"); + trait.pad_m = arg_parser.get_bool("pad_m"); + trait.pad_n = arg_parser.get_bool("pad_n"); + trait.pad_k = arg_parser.get_bool("pad_k"); + trait.persistent = arg_parser.get_bool("persistent"); bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 4a35a2bcd3..6d713bdcb8 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -15,16 +15,9 @@ from json_config 
import GemmConfig, RangeConfigParam from codegen_utils import ( DATA_TYPE_MAP, LAYOUT_MAP, - DEFAULT_EPILOGUE, - CSHUFFLE_EPILOGUE, - HOT_LOOP_FALSE, - RUN_MEM, - RUN_COMPV3, - RUN_COMPV4, PIPELINE_MAP, SCHEDULER_MAP, EPILOGUE_MAP, - HOT_LOOP_TRUE, BOOL_MAP, warp_tile_supported_combinations, trait_unsupported_combinations, @@ -114,7 +107,7 @@ class GemmCodeGenerator: def _generate_all_traits(self): """Generate all possible kernel traits names.""" - params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k"] + params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k", "persistent"] # Generate all unique_combinations _unique = set( @@ -124,13 +117,14 @@ class GemmCodeGenerator: ) for combo in _unique: - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = combo + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = combo current_combination = (pipeline, epilogue, scheduler) if current_combination not in trait_unsupported_combinations: trait_name = ( f"{pipeline}_{epilogue}_{scheduler}_" - f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}" + f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}_" + f"{BOOL_MAP(persistent)}" ) self.valid_trait_names.append(trait_name) else: @@ -189,7 +183,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map["matrix_c"]]}; def _generate_trait_file(self, trait: str): """Generate a trait with all tile/warp combinations.""" - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = trait.split("_") + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = trait.split("_") filename = f"gemm_{trait}.hpp" content = f"""// SPDX-License-Identifier: MIT @@ -206,8 +200,7 @@ namespace {trait} {{ """ # Add template struct with configuration content += self._generate_kernel_struct( - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k - ) + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent) content += f"\n}} // namespace {trait}\n" (self.output_dir / 
filename).write_text(content) @@ -220,6 +213,7 @@ namespace {trait} {{ pad_m: str, pad_n: str, pad_k: str, + persistent: str, ) -> str: """Generate the code block of kernel struct""" return f""" @@ -229,9 +223,10 @@ template struct GemmKernel {{ - static constexpr bool kPadM = {pad_m}; - static constexpr bool kPadN = {pad_n}; - static constexpr bool kPadK = {pad_k}; + static constexpr bool kPadM = {pad_m}; + static constexpr bool kPadN = {pad_n}; + static constexpr bool kPadK = {pad_k}; + static constexpr bool kPersistent = {persistent}; static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ static constexpr bool permuteA = false; @@ -250,7 +245,6 @@ struct GemmKernel {{ permuteA, permuteB>; - using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner; + ALayout, BLayout, CLayout, TransposeC, + structured_sparsity, kPersistent>; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; @@ -297,14 +292,14 @@ struct GemmKernel {{ using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); - if(!Kernel::IsSupportedArgument(kargs)) {{ throw std::runtime_error("Wrong! Arguments not supported! 
Skipping gemm!"); }} + constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = {'Kernel::MaxOccupancyGridSize(stream)' if persistent == 'true' else 'Kernel::GridSize(args.M, args.N, args.k_batch)'}; + if(stream.log_level_ > 0) {{ std::cout << "Launching kernel with args:" @@ -377,11 +372,7 @@ struct GemmKernel {{ }} }}; - if(has_hot_loop) {{ - {HOT_LOOP_TRUE[pipeline]} - }} else {{ - {HOT_LOOP_FALSE} - }} + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); return ave_time; }} @@ -395,7 +386,8 @@ struct GemmKernel {{ "{pad_k}" + "_" + "{pipeline}" + "_" + "{epilogue}" + "_" + - "{scheduler}"; + "{scheduler}" + "_" + + "{persistent}"; }} }}; """ @@ -673,6 +665,8 @@ struct KernelTraits bool pad_n; /// @brief Indicates whether padding is applied to the K dimension. bool pad_k; + /// @brief Indicates whether the kernel is persistent. + bool persistent; }; struct GemmDispatcher { @@ -773,7 +767,8 @@ private: trait.scheduler + "_" + (trait.pad_m ? "true" : "false") + "_" + (trait.pad_n ? "true" : "false") + "_" + - (trait.pad_k ? "true" : "false"); + (trait.pad_k ? "true" : "false") + "_" + + (trait.persistent ? 
"true" : "false"); } }; diff --git a/tile_engine/ops/gemm/json_config.py b/tile_engine/ops/gemm/json_config.py index 675a2052ef..04f2dd4890 100644 --- a/tile_engine/ops/gemm/json_config.py +++ b/tile_engine/ops/gemm/json_config.py @@ -107,6 +107,7 @@ class TraitConfig: pad_m: EnumConfigParam pad_n: EnumConfigParam pad_k: EnumConfigParam + persistent: EnumConfigParam @dataclass @@ -215,6 +216,9 @@ class GemmConfig: pad_k=EnumConfigParam( values=config_dict["trait_config"]["pad_k"]["values"] ), + persistent=EnumConfigParam( + values=config_dict["trait_config"]["persistent"]["values"] + ), ) return cls( From ab26026835b0766e068ed4458b3f7a17633ca7a7 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 7 Aug 2025 16:51:53 -0700 Subject: [PATCH 21/21] [CK-tile] add more tests for batched transpose testing the rectangular block tile sizes (#2634) * add failing tests * swap out and reference * add constraint assert to transpose input distribution * test both pipelines with rectangular block tile * print mismatched indices * add a smaller failing test for old pipeline * print grid and block * fill output before operating on it * swap m/n tile sizes and make one test pass * add device syncs * add one more flipped test case * flip block tile at host arg init * fix tiles for lds pipeline * clang-format * rename tests * roll back error check * remove device syncs * reduce large test case's size --- .../kernel/batched_transpose_kernel.hpp | 40 ++++---- .../batched_transpose_common_policy.hpp | 16 ++-- .../batched_transpose_lds_problem.hpp | 20 ++-- .../pipeline/batched_transpose_policy.hpp | 2 +- .../test_batched_transpose.cpp | 92 +++++++++++++++++-- 5 files changed, 127 insertions(+), 43 deletions(-) diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp index a89a190489..a4150e8d84 100644 --- 
a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp +++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp @@ -49,9 +49,11 @@ struct BatchedTransposeKernel CK_TILE_HOST static constexpr auto GridSize(const Hargs& host_args) { - size_t grid_size_x = (host_args.height + host_args.dim_block_h - 1) / host_args.dim_block_h; - size_t grid_size_y = (host_args.width + host_args.dim_block_w - 1) / host_args.dim_block_w; - size_t grid_size_z = host_args.batch; + const size_t grid_size_x = + ck_tile::integer_divide_ceil(host_args.height, host_args.dim_block_h); + const size_t grid_size_y = + ck_tile::integer_divide_ceil(host_args.width, host_args.dim_block_w); + const size_t grid_size_z = host_args.batch; return dim3(grid_size_x, grid_size_y, grid_size_z); } @@ -71,41 +73,43 @@ struct BatchedTransposeKernel CK_TILE_DEVICE void operator()(Kargs kargs) const { - static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; - static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; - static constexpr ck_tile::index_t VectorSizeInput = Problem::VectorSizeInput; - static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; + static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; + static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr ck_tile::index_t VectorSizeInput = Problem::VectorSizeInput; + static constexpr ck_tile::index_t VectorStrideInput = 1; + static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; + static constexpr ck_tile::index_t VectorStrideOutput = 1; - const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); - const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); - const auto iDim = 
blockIdx.z; + const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); + const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); + const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width); const auto x_m_n = [&]() { const auto x_dram_naive = make_naive_tensor_view( - static_cast(kargs.p_input) + iDim * kargs.dim_stride, + static_cast(kargs.p_input) + offset, make_tuple(kargs.height, kargs.width), make_tuple(kargs.width, 1), number{}, - number<1>{}); + number{}); return pad_tensor_view(x_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto y_n_m = [&]() { const auto y_dram_naive = make_naive_tensor_view( - static_cast(kargs.p_output) + iDim * kargs.dim_stride, + static_cast(kargs.p_output) + offset, make_tuple(kargs.width, kargs.height), make_tuple(kargs.height, 1), number{}, - number<1>{}); + number{}); return pad_tensor_view(y_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto x_block_window = make_tile_window( diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp index e344c24bf5..3b8d5a142e 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp @@ -15,15 +15,15 @@ struct BatchedTransposeCommonPolicy template CK_TILE_DEVICE static constexpr auto MakeInputDistribution() { - constexpr index_t BlockSize = Problem::kBlockSize; - constexpr index_t LeadDimPerBlock = Problem::kMPerBlock; - constexpr index_t SecondDimPerBlock = Problem::kNPerBlock; + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kLeadDimPerBlock = Problem::kNPerBlock; + constexpr index_t kSecondDimPerBlock = Problem::kMPerBlock; - constexpr index_t kVectorSize = Problem::VectorSizeOutput; - 
- using TileEncodingPattern = TileDistributionEncodingPattern2D; return TileEncodingPattern::Make2DStaticTileDistribution(); diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp index 491db37564..45803ae2da 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp @@ -18,19 +18,19 @@ struct BatchedTransposeLdsProblem { using DataType = remove_cvref_t; - static constexpr index_t kRowWarps_ = NumWarps::at(number<1>{}); - static constexpr index_t kColWarps_ = NumWarps::at(number<0>{}); + static constexpr index_t kRowWarps_ = NumWarps::at(number<0>{}); + static constexpr index_t kColWarps_ = NumWarps::at(number<1>{}); static constexpr index_t kBlockSize_ = get_warp_size() * kRowWarps_ * kColWarps_; - static constexpr index_t kRowPerBlock_ = BlockTile::at(number<1>{}); - static constexpr index_t kColPerBlock_ = BlockTile::at(number<0>{}); + static constexpr index_t kRowPerBlock_ = BlockTile::at(number<0>{}); + static constexpr index_t kColPerBlock_ = BlockTile::at(number<1>{}); static constexpr index_t kBlockSize = kBlockSize_; // warps per block - static constexpr index_t kLeadNumWarps = kRowWarps_; - static constexpr index_t kSecondNumWarps = kColWarps_; + static constexpr index_t kLeadNumWarps = kColWarps_; + static constexpr index_t kSecondNumWarps = kRowWarps_; - static constexpr index_t kLeadSizePerBlock = kRowPerBlock_; - static constexpr index_t kSecondSizePerBlock = kColPerBlock_; + static constexpr index_t kLeadSizePerBlock = kColPerBlock_; + static constexpr index_t kSecondSizePerBlock = kRowPerBlock_; static constexpr index_t kQuadrantLeadDim = LaneGroupTransposeTraits::kleadDim; static constexpr index_t kQuadrantSecondDim = LaneGroupTransposeTraits::ksecondDim; @@ -60,8 +60,8 @@ struct BatchedTransposeLdsProblem 
static constexpr bool kPadM = kPadM_; static constexpr bool kPadN = kPadN_; - static constexpr auto kMPerBlock = kLeadSizePerBlock; - static constexpr auto kNPerBlock = kSecondSizePerBlock; + static constexpr auto kMPerBlock = kSecondSizePerBlock; + static constexpr auto kNPerBlock = kLeadSizePerBlock; // 128-bit is the max single-instruction bandwidth for load/store static constexpr index_t MaxLoadStoreSize = 16; diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp index 5238fecdc5..e6bbc709ea 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp @@ -19,8 +19,8 @@ struct BatchedTransposePolicy : public BatchedTransposeCommonPolicy constexpr index_t VecLoadSize = Problem::VectorSizeOutput; using TileEncodingPattern = TileDistributionEncodingPattern2D; return TileEncodingPattern::MakeShuffled2DStaticTileDistribution(); diff --git a/test/ck_tile/batched_transpose/test_batched_transpose.cpp b/test/ck_tile/batched_transpose/test_batched_transpose.cpp index cce00e27cb..77d5825eed 100644 --- a/test/ck_tile/batched_transpose/test_batched_transpose.cpp +++ b/test/ck_tile/batched_transpose/test_batched_transpose.cpp @@ -95,10 +95,12 @@ class TestCkTileBatchedTranspose // N C H W layout_in== ck_tile::HostTensor y_ref(Y_dim, Y_stride); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillConstant{-37}(y_host); ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes()); x_dev.ToDevice(x_host.data()); + y_dev.ToDevice(y_host.data()); using Kernel = typename Config::Kernel; @@ -131,8 +133,8 @@ class TestCkTileBatchedTranspose // N C H W layout_in== height, width, height * width, - Config::BlockTile::at(1), - Config::BlockTile::at(0)}; + Config::BlockTile::at(0), 
+ Config::BlockTile::at(1)}; auto kargs = Kernel::MakeKargs(host_args); auto sc = ck_tile::stream_config{}; @@ -140,15 +142,24 @@ class TestCkTileBatchedTranspose // N C H W layout_in== constexpr dim3 block_size = Kernel::BlockSize(); ck_tile::launch_kernel( sc, ck_tile::make_kernel(Kernel{}, grid_size, block_size, 0, kargs)); + y_dev.FromDevice(y_host.data()); ck_tile::reference_batched_transpose(x_host, y_ref, layout_in, layout_out); std::ostringstream message; message << "N=" << N << " C=" << C << " H=" << H << " W=" << W << " layout_in=" << layout_in - << " layout_out=" << layout_out << " device_name=" << device_name; + << " layout_out=" << layout_out << " grid_size={" << grid_size.x << ", " + << grid_size.y << ", " << grid_size.z << "} block_size=" << block_size.x + << " device_name=" << device_name; + // NB: order of output and reference matters bool pass = ck_tile::check_err( - y_ref, y_host, message.str(), /* rtol */ 0, /* atol */ 0, /* allow inf */ false); + /* out */ y_host, + /* ref */ y_ref, + message.str(), + /* rtol */ 0, + /* atol */ 0, + /* allow inf */ false); EXPECT_TRUE(pass); } @@ -160,14 +171,16 @@ static const auto kTestingValues = ::testing::Values( // N C H W layout_in==NCHW std::tuple{1, 32, 1, 32, true}, std::tuple{1, 64, 1, 64, true}, + std::tuple{1, 32, 1, 64, true}, + std::tuple{1, 64, 1, 32, true}, std::tuple{2, 12, 1, 32, false}, std::tuple{3, 1334, 1, 37, false}, std::tuple{4, 27, 1, 32, true}, std::tuple{5, 1234, 1, 12, true}, std::tuple{1, 1, 1, 1, true}, std::tuple{1, 1, 1, 1, false}, - std::tuple{128, 1024, 64, 64, true}, - std::tuple{128, 1024, 64, 64, false}, + std::tuple{17, 1024, 64, 64, true}, + std::tuple{17, 1024, 64, 64, false}, std::tuple{16, 64, 32, 128, true}, std::tuple{16, 64, 128, 32, false}, std::tuple{1, 2048, 1, 1, true}, @@ -239,6 +252,60 @@ class CaseHalfPadMultiWarpLoadTranspose { }; +class CaseHalfPadMultiWarp128MNLoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfPadMultiWarp128MN + 
: public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile1 + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile2 + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile1LoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfPadRectTile2LoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + TEST_P(CaseHalf, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseByte, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseWord, TestCorrectness) { this->Run(GetParam()); } @@ -248,6 +315,12 @@ TEST_P(CaseHalfPad, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadLoadTranspose, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadMultiWarp, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadMultiWarpLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarp128MN, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarp128MNLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile1, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile1LoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile2, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile2LoadTranspose, TestCorrectness) { this->Run(GetParam()); } // clang-format off INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalf, kTestingValues); @@ -259,4 +332,11 @@ INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPad, kTestingV INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadLoadTranspose, kTestingValues); INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp, kTestingValues); INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarpLoadTranspose, kTestingValues); 
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MN, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MNLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1LoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2LoadTranspose, kTestingValues); + // clang-format on