diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp index 62b3ad2207..5f977d00b4 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_initialization.hpp @@ -53,7 +53,7 @@ void init_tensor_buffer_uniform_int(void* buf, using ck_type = factory::internal::DataTypeToCK
::type; // we might be asked to generate int values on fp data types that don't have the required - // precision + // precision. Check using >= and <= because == is not allowed for floats. if(static_cast(max_value - 1) <= static_cast(min_value) && static_cast(max_value - 1) >= static_cast(min_value)) { diff --git a/experimental/grouped_convolution_tile_instances/CMakeLists.txt b/experimental/grouped_convolution_tile_instances/CMakeLists.txt index 1264a68906..9a75fdcff6 100644 --- a/experimental/grouped_convolution_tile_instances/CMakeLists.txt +++ b/experimental/grouped_convolution_tile_instances/CMakeLists.txt @@ -16,4 +16,5 @@ if(GPU_TARGETS MATCHES "gfx9") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") + target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) endif() diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf index d04973f05d..e91dd97563 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf @@ -58,9 +58,10 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pa DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf index f273002e04..dad7cdffd8 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf @@ -58,9 +58,10 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pa DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf index 34f58f5c04..4bf7a0b238 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf @@ -58,8 +58,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf index d5a4a28beb..580a0a1941 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf @@ -58,9 +58,10 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pa DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf index e20fe50689..99457320b0 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf @@ -58,9 +58,10 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pa DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf index 34f58f5c04..4bf7a0b238 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf +++ b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf @@ -58,8 +58,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf index fb44310793..73a3bd0357 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf index fb44310793..73a3bd0357 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf index 99afc4ad26..0f66f1f48f 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf index fb44310793..73a3bd0357 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf index 5b39beabc9..06ff7d20a4 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> @@ -39,11 +40,11 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Default, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> \ No newline at end of file +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf index 99afc4ad26..0f66f1f48f 100644 --- a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf +++ b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf @@ -7,7 +7,8 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> -DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +# LargeTensor is temporary disable due to failures +# DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 3a7231f71d..f1aba16645 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -85,6 +85,13 @@ __device__ inline auto amd_wave_read_first_lane(const Object& obj) return out; } +// Overload for host to return the same value +template +__host__ inline T amd_wave_read_first_lane(T v) +{ + return v; +} + // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions struct __attribute__((packed)) buffer_resource diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 97c8be5f2b..c165cacba2 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -81,6 +81,13 @@ __device__ inline auto amd_wave_read_first_lane(const Object& obj) return out; } +// Overload for host to return the same value +template +__host__ inline T amd_wave_read_first_lane(T v) +{ + return v; +} + // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions struct __attribute__((packed)) buffer_resource diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp index 84db819f66..48e7a03d83 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp @@ -44,15 +44,20 @@ struct BaseGemmPipelineAgBgCrCompAsync CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number) { + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + const TailNumber tail_number_first_lane = amd_wave_read_first_lane(tail_number); // Handle all the valid cases. - if(has_hot_loop) + if(has_hot_loop_first_lane) { - if(tail_number == TailNumber::Three) + if(tail_number_first_lane == TailNumber::Three) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Two) + else if(tail_number_first_lane == TailNumber::Two) { return run_func(bool_constant{}, integral_constant{}); @@ -60,12 +65,12 @@ struct BaseGemmPipelineAgBgCrCompAsync } else { - if(tail_number == TailNumber::Three) + if(tail_number_first_lane == TailNumber::Three) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Two) + else if(tail_number_first_lane == TailNumber::Two) { return run_func(bool_constant{}, integral_constant{}); @@ -430,7 +435,7 @@ struct GemmPipelineAgBgCrCompAsync : public BaseGemmPipelineAgBgCrCompAsync{}], b_dram_tile_window_step); - if(HasHotLoop) + if constexpr(HasHotLoop) { // we have had 3 global prefetches so far, indexed (0, 1, 2). index_t i_global_read = amd_wave_read_first_lane(3); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp index dbdc35d7c6..83f7f80824 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -46,6 +46,12 @@ struct BaseGemmPipelineAgBgCrCompV3 CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number) { + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + const TailNumber tail_number_first_lane = amd_wave_read_first_lane(tail_number); + constexpr auto scenarios = []() { if constexpr(Problem::BlockGemmShape::NumWarps == 8) return std::array, 5>{ @@ -62,7 +68,8 @@ struct BaseGemmPipelineAgBgCrCompV3 std::make_pair(false, TailNumber::Even), }; }(); - if(has_hot_loop == scenarios[I].first && tail_number == scenarios[I].second) + if(has_hot_loop_first_lane == scenarios[I].first && + tail_number_first_lane == scenarios[I].second) return run_func(bool_constant{}, constant{}); else if constexpr(I + 1 < scenarios.size()) return TailHandler(run_func, has_hot_loop, tail_number); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index 7c2ab080bd..2c94648e66 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -47,15 +47,20 @@ struct BaseGemmPipelineAgBgCrCompV4 CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number) { + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + const TailNumber tail_number_first_lane = amd_wave_read_first_lane(tail_number); // Handle all the valid cases. - if(has_hot_loop) + if(has_hot_loop_first_lane) { - if(tail_number == TailNumber::Three) + if(tail_number_first_lane == TailNumber::Three) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Two) + else if(tail_number_first_lane == TailNumber::Two) { return run_func(bool_constant{}, integral_constant{}); @@ -63,12 +68,12 @@ struct BaseGemmPipelineAgBgCrCompV4 } else { - if(tail_number == TailNumber::Three) + if(tail_number_first_lane == TailNumber::Three) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Two) + else if(tail_number_first_lane == TailNumber::Two) { return run_func(bool_constant{}, integral_constant{}); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp index 1e92eeddf4..09d3f4df57 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp @@ -43,15 +43,20 @@ struct BaseGemmPipelineAgBgCrCompV6 CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number) { + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + const TailNumber tail_number_first_lane = amd_wave_read_first_lane(tail_number); // Handle all the valid cases. - if(has_hot_loop) + if(has_hot_loop_first_lane) { - if(tail_number == TailNumber::Odd) + if(tail_number_first_lane == TailNumber::Odd) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Even) + else if(tail_number_first_lane == TailNumber::Even) { return run_func(bool_constant{}, integral_constant{}); @@ -59,12 +64,12 @@ struct BaseGemmPipelineAgBgCrCompV6 } else { - if(tail_number == TailNumber::Odd) + if(tail_number_first_lane == TailNumber::Odd) { return run_func(bool_constant{}, integral_constant{}); } - else if(tail_number == TailNumber::Even) + else if(tail_number_first_lane == TailNumber::Even) { return run_func(bool_constant{}, integral_constant{}); @@ -567,7 +572,7 @@ struct GemmPipelineAgBgCrCompV6 : public BaseGemmPipelineAgBgCrCompV6 BasePImpl::LocalPrefetch(a_lds_tile, a_lds_gemm_window, is_a_load_tr_v); BasePImpl::LocalPrefetch(b_lds_tile, b_lds_gemm_window, is_b_load_tr_v); - if(HasHotLoop) + if constexpr(HasHotLoop) { index_t i = 0; do diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 2837c41c1b..1f87ec6bff 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -93,9 +93,14 @@ struct BaseGemmPipelineAgBgCrMem CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number) { + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + const TailNumber tail_number_first_lane = amd_wave_read_first_lane(tail_number); // Wrap the hot_loop dispatch first. auto tail_dispatch = [&](auto tail_num_constant) { - if(has_hot_loop) + if(has_hot_loop_first_lane) { return run_func(bool_constant{}, tail_num_constant); } @@ -106,7 +111,7 @@ struct BaseGemmPipelineAgBgCrMem }; #define CHECK_TAIL_NUMBER(TAIL_NUMBER, PREFETCH_VALUE) \ - else if(tail_number == TailNumber::TAIL_NUMBER) \ + else if(tail_number_first_lane == TailNumber::TAIL_NUMBER) \ { \ if constexpr(PrefetchStages > PREFETCH_VALUE) \ { \ @@ -114,11 +119,11 @@ struct BaseGemmPipelineAgBgCrMem } \ } // Handle all the valid cases. - if(tail_number == TailNumber::One) + if(tail_number_first_lane == TailNumber::One) { return tail_dispatch(integral_constant{}); } - else if(tail_number == TailNumber::Full) + else if(tail_number_first_lane == TailNumber::Full) { return tail_dispatch(integral_constant{}); } diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_async_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_async_v1.hpp index 848764007a..c0a1875e73 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_async_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_async_v1.hpp @@ -16,6 +16,7 @@ namespace ck_tile { template struct GemmPipelineAGmemBGmemCRegAsyncV1 : public BaseGemmPipelineAGmemBGmemCRegV1 { + using Base = BaseGemmPipelineAGmemBGmemCRegV1; using PipelineImplBase = GemmPipelineAgBgCrImplBase; using AsDataType = remove_cvref_t; @@ -117,7 +118,8 @@ struct GemmPipelineAGmemBGmemCRegAsyncV1 : public BaseGemmPipelineAGmemBGmemCReg { using Base = PipelineImplBase; - template 0) + if constexpr(HasHotLoop) { - Base::LocalPrefetch(a_block_tile, a_lds_ld_window, is_a_load_tr_v); - Base::LocalPrefetch(b_block_tile, b_lds_ld_window, is_b_load_tr_v); + index_t iCounter = num_loop - 1; + while(iCounter > 0) + { + Base::LocalPrefetch(a_block_tile, a_lds_ld_window, is_a_load_tr_v); + Base::LocalPrefetch(b_block_tile, b_lds_ld_window, is_b_load_tr_v); - block_sync_lds(); + block_sync_lds(); - Base::GlobalPrefetchAsync( - a_copy_lds_window, a_tile_windows, a_dram_tile_window_step); - Base::GlobalPrefetchAsync( - b_copy_lds_window, b_tile_windows, b_dram_tile_window_step); + Base::GlobalPrefetchAsync( + a_copy_lds_window, a_tile_windows, a_dram_tile_window_step); + Base::GlobalPrefetchAsync( + b_copy_lds_window, b_tile_windows, b_dram_tile_window_step); - // GEMM i - block_gemm(c_block_tile, a_block_tile, b_block_tile); + // GEMM i + block_gemm(c_block_tile, a_block_tile, b_block_tile); - block_sync_lds_direct_load(); + block_sync_lds_direct_load(); - iCounter--; + iCounter--; + } } // tail @@ -311,12 +316,18 @@ struct GemmPipelineAGmemBGmemCRegAsyncV1 : public BaseGemmPipelineAGmemBGmemCReg index_t num_loop, void* p_smem) const { - return PipelineImpl{}.operator()(a_dram_block_window_tmp, - element_wise::PassThrough{}, - b_dram_block_window_tmp, - element_wise::PassThrough{}, - num_loop, - p_smem); + const bool has_hot_loop = Base::BlockHasHotloop(num_loop); + const auto RunPipeline = [&](auto hot_loop_) { + constexpr bool hot_loop = hot_loop_.value; + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + element_wise::PassThrough{}, + b_dram_block_window_tmp, + element_wise::PassThrough{}, + num_loop, + p_smem); + }; + return Base::TailHandler(RunPipeline, has_hot_loop); } template {}.operator()(a_dram_block_window_tmp, - a_element_func, - b_dram_block_window_tmp, - b_element_func, - num_loop, - p_smem); + const bool has_hot_loop = Base::BlockHasHotloop(num_loop); + const auto RunPipeline = [&](auto hot_loop_) { + constexpr bool hot_loop = hot_loop_.value; + return PipelineImpl{}.template operator()(a_dram_block_window_tmp, + a_element_func, + b_dram_block_window_tmp, + b_element_func, + num_loop, + p_smem); + }; + return Base::TailHandler(RunPipeline, has_hot_loop); } }; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index 7a893cc6ca..942a496d33 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -19,7 +19,10 @@ struct BaseGemmPipelineAGmemBGmemCRegV1 CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } - CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t) { return true; } + CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop) + { + return num_loop > PrefetchStages; + } CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t) { @@ -27,9 +30,21 @@ struct BaseGemmPipelineAGmemBGmemCRegV1 } template - CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool, TailNumber) + CK_TILE_HOST_DEVICE static auto TailHandler(const RunFunction& run_func, bool has_hot_loop) { - return run_func(bool_constant{}, integral_constant{}); + // Use amd_wave_read_first_lane to avoid higher resource usage. + // It forces to store these values in SGPR. + // Compiler cannot deduce if one path is used for all threads + const bool has_hot_loop_first_lane = amd_wave_read_first_lane(has_hot_loop); + + if(has_hot_loop_first_lane) + { + return run_func(ck_tile::bool_constant{}); + } + else + { + return run_func(ck_tile::bool_constant{}); + } } }; @@ -39,6 +54,7 @@ struct BaseGemmPipelineAGmemBGmemCRegV1 template struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1 { + using Base = BaseGemmPipelineAGmemBGmemCRegV1; using PipelineImplBase = GemmPipelineAgBgCrImplBase; using AsDataType = remove_cvref_t; @@ -137,7 +153,8 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1 0) + if constexpr(HasHotLoop) { - // global read i + 1 - elementwise_As_res = - load_tile_with_elementwise(as_copy_dram_window, a_element_func); - block_sync_lds(); - elementwise_Bs_res = - load_tile_with_elementwise(bs_copy_dram_window, b_element_func); - - block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); - - // GEMM i - block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); - - block_sync_lds(); - - // move to i + 2 - move_tile_window(as_copy_dram_window, {0, kKPerBlock}); - move_tile_window(bs_copy_dram_window, {0, kKPerBlock}); - - // LDS write i + 1 - if constexpr(is_a_col_major) + index_t iCounter = num_loop - 1; + while(iCounter > 0) { - auto a_shuffle_tmp_loop = make_static_distributed_tensor( - Policy::template MakeShuffledARegTileDistribution()); - transpose_tile2d(a_shuffle_tmp_loop, elementwise_As_res); - store_tile(a_copy_lds_window, a_shuffle_tmp_loop); - } - else - { - store_tile(a_copy_lds_window, elementwise_As_res); - } + // global read i + 1 + elementwise_As_res = + load_tile_with_elementwise(as_copy_dram_window, a_element_func); + elementwise_Bs_res = + load_tile_with_elementwise(bs_copy_dram_window, b_element_func); + block_sync_lds(); - // LDS write i + 1 - if constexpr(is_b_row_major) - { - auto b_shuffle_tmp_loop = make_static_distributed_tensor( - Policy::template MakeShuffledBRegTileDistribution()); - transpose_tile2d(b_shuffle_tmp_loop, elementwise_Bs_res); - store_tile(b_copy_lds_window, b_shuffle_tmp_loop); - } - else - { - store_tile(b_copy_lds_window, elementwise_Bs_res); - } + block_gemm.LocalPrefetch(a_lds_gemm_window, b_lds_gemm_window); - iCounter--; + // GEMM i + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); + + block_sync_lds(); + + // move to i + 2 + move_tile_window(as_copy_dram_window, a_dram_tile_window_step); + move_tile_window(bs_copy_dram_window, b_dram_tile_window_step); + + // LDS write i + 1 + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp_loop = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp_loop, elementwise_As_res); + store_tile(a_copy_lds_window, a_shuffle_tmp_loop); + } + else + { + store_tile(a_copy_lds_window, elementwise_As_res); + } + + // LDS write i + 1 + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp_loop = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp_loop, elementwise_Bs_res); + store_tile(b_copy_lds_window, b_shuffle_tmp_loop); + } + else + { + store_tile(b_copy_lds_window, elementwise_Bs_res); + } + + iCounter--; + } } // tail @@ -340,7 +368,8 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1 0) + if constexpr(HasHotLoop) { - // global read i + 1 - elementwise_As_res = - load_tile_with_elementwise(as_copy_dram_window, a_element_func); - block_sync_lds(); - elementwise_Bs_res = - load_tile_with_elementwise(bs_copy_dram_window, b_element_func); - - // GEMM i - block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); - - // move to i + 2 - move_tile_window(as_copy_dram_window, {0, kKPerBlock}); - move_tile_window(bs_copy_dram_window, {0, kKPerBlock}); - - // LDS write i + 1 - if constexpr(is_a_col_major) + index_t iCounter = num_loop - 1; + while(iCounter > 0) { - auto a_shuffle_tmp_loop = make_static_distributed_tensor( - Policy::template MakeShuffledARegTileDistribution()); - transpose_tile2d(a_shuffle_tmp_loop, elementwise_As_res); - store_tile(a_copy_lds_window, a_shuffle_tmp_loop); - } - else - { - store_tile(a_copy_lds_window, elementwise_As_res); - } + // global read i + 1 + elementwise_As_res = + load_tile_with_elementwise(as_copy_dram_window, a_element_func); + block_sync_lds(); + elementwise_Bs_res = + load_tile_with_elementwise(bs_copy_dram_window, b_element_func); - // LDS write i + 1 - if constexpr(is_b_row_major) - { - auto b_shuffle_tmp_loop = make_static_distributed_tensor( - Policy::template MakeShuffledBRegTileDistribution()); - transpose_tile2d(b_shuffle_tmp_loop, elementwise_Bs_res); - store_tile(b_copy_lds_window, b_shuffle_tmp_loop); - } - else - { - store_tile(b_copy_lds_window, elementwise_Bs_res); - } + // GEMM i + block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); - iCounter--; + // move to i + 2 + move_tile_window(as_copy_dram_window, {0, kKPerBlock}); + move_tile_window(bs_copy_dram_window, {0, kKPerBlock}); + + // LDS write i + 1 + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp_loop = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp_loop, elementwise_As_res); + store_tile(a_copy_lds_window, a_shuffle_tmp_loop); + } + else + { + store_tile(a_copy_lds_window, elementwise_As_res); + } + + // LDS write i + 1 + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp_loop = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp_loop, elementwise_Bs_res); + store_tile(b_copy_lds_window, b_shuffle_tmp_loop); + } + else + { + store_tile(b_copy_lds_window, elementwise_Bs_res); + } + + iCounter--; + } } // tail @@ -543,13 +575,18 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1{}.operator()( - a_dram_block_window_tmp, - [](auto& e, const ADataType & a) { e = a; }, - b_dram_block_window_tmp, - [](auto& e, const BDataType & b) { e = b; }, - num_loop, - p_smem); + const bool has_hot_loop = Base::BlockHasHotloop(num_loop); + const auto RunPipeline = [&](auto hot_loop_) { + constexpr bool hot_loop = hot_loop_.value; + return PipelineImpl{}.template operator()( + a_dram_block_window_tmp, + element_wise::PassThrough{}, + b_dram_block_window_tmp, + element_wise::PassThrough{}, + num_loop, + p_smem); + }; + return Base::TailHandler(RunPipeline, has_hot_loop); } template {}.operator()(a_dram_block_window_tmp, - a_element_func, - b_dram_block_window_tmp, - b_element_func, - num_loop, - p_smem); + const bool has_hot_loop = Base::BlockHasHotloop(num_loop); + const auto RunPipeline = [&](auto hot_loop_) { + constexpr bool hot_loop = hot_loop_.value; + return PipelineImpl{}.template operator()(a_dram_block_window_tmp, + a_element_func, + b_dram_block_window_tmp, + b_element_func, + num_loop, + p_smem); + }; + return Base::TailHandler(RunPipeline, has_hot_loop); } }; diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index 7aa58c4763..fb82d77fe6 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -1135,6 +1135,7 @@ struct GroupedConvolutionBackwardDataKernel // allocate LDS __shared__ char smem_ptr[GetSmemSize()]; + // Disable Async for other archs than gfx950 if constexpr(GemmPipeline_::Async) { #if defined(__gfx950__) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 46a0ff8b47..143c003784 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -906,6 +906,7 @@ struct GroupedConvolutionBackwardWeightKernel __shared__ char smem_ptr[GetSmemSize()]; + // Disable Async for other archs than gfx950 if constexpr(GemmPipeline_::Async) { #if defined(__gfx950__) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index a8f88d48c7..572f52b40d 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -1149,6 +1149,7 @@ struct GroupedConvolutionForwardKernel // allocate LDS __shared__ char smem_ptr[GetSmemSize()]; + // Disable Async for other archs than gfx950 if constexpr(GemmPipeline_::Async) { #if defined(__gfx950__) diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index a1231ff0f1..56f8f30785 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -7,6 +7,7 @@ #include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "grouped_convolution_signatures.hpp" +#include "common.hpp" #include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" #include "ck_tile/builder/testing/filter_extent.hpp" @@ -96,6 +97,29 @@ auto parse_conv_args(int arg_idx, char* const argv[]) return args; } +template +void run_cpu_validation(const ckt::Args& args, + const ckt::Outputs& outputs, + const ckt::Outputs& reference) +{ + using DataType = + std::conditional_t>; + const auto conv_param = args.to_ck_tile_conv_param(); + + const std::size_t output_bytes_num = conv_param.template GetOutputByte(); + std::vector out(output_bytes_num / sizeof(DataType)); + std::vector ref(output_bytes_num / sizeof(DataType)); + HIP_CHECK_ERROR( + hipMemcpy(&ref.data()[0], reference.output, output_bytes_num, hipMemcpyDeviceToHost)); + HIP_CHECK_ERROR( + hipMemcpy(&out.data()[0], outputs.output, output_bytes_num, hipMemcpyDeviceToHost)); + ck_tile::check_err(out, ref, "Error: Incorrect results!"); +} + /// @brief `run_grouped_conv_forward_tile_algs()` run all grouped conv fwd instances. /// /// @tparam SIGNATURE Forward convolution signature. @@ -114,39 +138,19 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, float avg_time; bool valid = true; - auto reference = ckt::alloc_outputs(args); - using ReferenceInstance = - typename ckb::ConvBuilder::Instance; - auto ref_conv = ReferenceInstance{}; - [[maybe_unused]] auto ref_result = ckt::run(ref_conv, args, inputs, reference.get()); - -#if ENABLE_BUILDER_VALIDATE == 0 using DataType = std::conditional_t>; - const auto conv_param = args.to_ck_tile_conv_param(); - const std::size_t output_bytes_num = conv_param.template GetOutputByte(); - std::vector out(output_bytes_num / sizeof(DataType)); - std::vector ref(output_bytes_num / sizeof(DataType)); - HIP_CHECK_ERROR( - hipMemcpy(&ref.data()[0], reference.get().output, output_bytes_num, hipMemcpyDeviceToHost)); - - const ck_tile::index_t GemmK = std::accumulate(conv_param.filter_spatial_lengths_.cbegin(), - conv_param.filter_spatial_lengths_.cend(), - 1, - std::multiplies()) * - conv_param.C_; - float max_accumulated_value = *std::max_element(ref.begin(), ref.end()); - const auto rtol = ck_tile::get_relative_threshold(GemmK); - const auto atol = - ck_tile::get_absolute_threshold(max_accumulated_value, GemmK); -#endif - - [[maybe_unused]] auto run_alg = [&](auto&& run_alg_func) { + auto reference = ckt::alloc_outputs(args); + using ReferenceInstance = + typename ckb::ConvBuilder::Instance; + auto ref_conv = ReferenceInstance{}; + auto ref_result = ckt::run(ref_conv, args, inputs, reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(is_supported) { @@ -155,20 +159,27 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, std::cout << "Perf: " << std::setw(10) << avg_time << " ms," << " " << op_name << std::endl; -#if ENABLE_BUILDER_VALIDATE - const auto errors = ckt::validate(args, outputs, reference.get()).get_errors(); - for(const auto& error : errors) + ckt::ValidationReport report; + ckt::Outputs::reflect( + args, + [&](std::string_view name, const auto& desc, void* ckt::Outputs::*ptr) { + report.check(name, + desc, + outputs.*ptr, + reference.get().*ptr, + ck::profiler::get_rtol(), + ck::profiler::get_atol()); + }); + + for(const auto& error : report.get_errors()) { valid = false; std::cout << "Number of incorrect values: " << error.wrong_elements << " Is all zero:" << error.is_all_zero() << " max err: " << error.max_error << std::endl; + // Check with cpu verification to get a values + run_cpu_validation(args, outputs, reference.get()); } -#else - HIP_CHECK_ERROR( - hipMemcpy(&out.data()[0], outputs.output, output_bytes_num, hipMemcpyDeviceToHost)); - valid = ck_tile::check_err(out, ref, "Error: Incorrect results!"); -#endif } else { diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp index 06e272e727..840215cfba 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp @@ -11,11 +11,6 @@ #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" -// TODO: Remove limitation of conv fwd gpu reference which does not support right pad -#define CK_CONV_FWD_REF_SKIP_RIGHT_PAD_CASES 1 -// TODO: Remove this limitation after gpu reference fix -#define ENABLE_BHALF_GROUPED_CONV_FWD_TESTS 0 - static ck::index_t args_mask = 0xffff; static ck::index_t instance_index = -1; @@ -103,17 +98,6 @@ class TestGroupedConvndFwdTile : public ::testing::Test const std::vector& input_left_pads, const std::vector& input_right_pads) { -#if CK_CONV_FWD_REF_SKIP_RIGHT_PAD_CASES - bool without_right_pad = true; - for(const std::size_t& right_pad : input_right_pads) - { - without_right_pad &= right_pad == 0; - } - if(!without_right_pad) - { - return; - } -#endif ckt::Args args = { .lengths = { @@ -155,12 +139,13 @@ using KernelTypes2d = ::testing::Types, + SignatureDetails<2, + ckb::DataType::BF16, + ckb::DataType::FP32, + ckb::TensorLayout::NHWGC, + ckb::TensorLayout::GKYXC, ckb::TensorLayout::NHWGK>>; -#if ENABLE_BHALF_GROUPED_CONV_FWD_TESTS -SignatureDetails < 2, ckb::DataType::BF16, ckb::DataType::FP32, ckb::TensorLayout::NHWGC, - ckb::TensorLayout::GKYXC, ckb::TensorLayout::NHWGK >> - ; -#endif using KernelTypes3d = ::testing::Types, + SignatureDetails<3, + ckb::DataType::BF16, + ckb::DataType::FP32, + ckb::TensorLayout::NDHWGC, + ckb::TensorLayout::GKZYXC, ckb::TensorLayout::NDHWGK>>; -#if ENABLE_BHALF_GROUPED_CONV_FWD_TESTS -SignatureDetails < 3, ckb::DataType::BF16, ckb::DataType::FP32, ckb::TensorLayout::NDHWGC, - ckb::TensorLayout::GKZYXC, ckb::TensorLayout::NDHWGK >> - ; -#endif template class TestGroupedConvndFwdTile2d : public TestGroupedConvndFwdTile