diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
index cbadd9cf76..020f047d1a 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -8,7 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
diff --git a/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp b/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
index 82d7de2a7d..21602b19bd 100644
--- a/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
+++ b/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
@@ -6,7 +6,7 @@
 #include
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index 8a295d14c4..c576798273 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -3,7 +3,8 @@ add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8 convnd_fwd_xdl_fp8.cpp)
-add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+# FIXME: re-enable this example as a test when SWDEV-335738 is fixed
+add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
 add_example_executable(example_convnd_fwd_xdl_bf8 convnd_fwd_xdl_bf8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp16_comp_fp8 convnd_fwd_xdl_fp16_comp_fp8.cpp)
 add_example_executable(example_convnd_fwd_xdl_fp8_bf8 convnd_fwd_xdl_fp8_bf8.cpp)
diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
index 24c8d82f67..1e7899b35d 100644
--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +27,12 @@ using DeviceElementwiseAddInstance =
     ck::Tuple,
     Add,
     2,
+    64,
+    64,
+    64,
     8,
+    8,
+    ck::Sequence<1, 0>,
     ck::Sequence<8, 8>,
     ck::Sequence<8>>;
diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
index 3c04c56140..5f321e6284 100644
--- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple,
     Add,
     3,
-    8,
-    ck::Sequence<1, 8>,
-    ck::Sequence<8>>;
+    64,
+    16,
+    16,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<1, 2>,
+    ck::Sequence<2>>;

 template
 void host_broadcast3D_am_bmnk(HostTensorC& C,
diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp
index 1ac09641a1..90e2d28d95 100644
--- a/example/19_binary_elementwise/elementwise_add_1d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_1d.cpp
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -25,9 +25,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple,
     Add,
     1,
-    8,
-    ck::Sequence<8, 8>,
-    ck::Sequence<8>>;
+    64,
+    16,
+    16,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<2, 2>,
+    ck::Sequence<2>>;

 template
 void host_elementwise1D(
diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp
index e571aa8468..797521dcb4 100644
--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
     ck::Tuple,
     Add,
     4,
-    8,
-    ck::Sequence<8, 8>,
-    ck::Sequence<8>>;
+    64,
+    2,
+    128,
+    2,
+    2,
+    ck::Sequence<1, 0>,
+    ck::Sequence<2, 2>,
+    ck::Sequence<2>>;

 template
 void host_elementwise4D(HostTensorC& C,
diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
index 96d04dcb37..5dccb11bba 100644
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include
@@ -9,7 +9,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -103,9 +103,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
     ck::Tuple, // y
     NormalizeFunctor,
     2,
-    8,                           // MPerthread
-    ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
-    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
+    64,                          // BlockSize
+    16,                          // MPerBlock
+    16,                          // NPerBlock
+    2,                           // MPerthread
+    2,                           // NPerthread
+    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
+    ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
+    ck::Sequence<2>>;            // scalarPerVector: y(layerNorm_out)

 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
     return HostTensorDescriptor({len}, {stride});
diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
index bd1d6932ac..168193ad5b 100644
--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include
 #include
@@ -9,7 +9,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -102,9 +102,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
     ck::Tuple, // y
     NormalizeFunctor,
     2,
-    8,                           // MPerthread
-    ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
-    ck::Sequence<8>>;            // scalarPerVector: y(layerNorm_out)
+    64,                          // BlockSize
+    16,                          // MPerBlock
+    16,                          // NPerBlock
+    2,                           // MPerthread
+    2,                           // NPerthread
+    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
+    ck::Sequence<2, 1, 1, 2, 2>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
+    ck::Sequence<2>>;            // scalarPerVector: y(layerNorm_out)

 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
     return HostTensorDescriptor({len}, {stride});
diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp
index d0b545b2a3..ac6a0e451d 100644
--- a/example/34_batchnorm/batchnorm_infer_impl.hpp
+++ b/example/34_batchnorm/batchnorm_infer_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
@@ -10,7 +10,7 @@
 #include "ck/utility/sequence.hpp"
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/reduction_operator.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "batchnorm_common.hpp"
@@ -54,7 +54,12 @@ int bnorm_infer(
     ck::Tuple, // y
     NormalizeInInfer,
     Rank,
-    2,                           // MPerthread
+    64,                          // BlockSize
+    32,                          // MPerBlock
+    32,                          // NPerBlock
+    4,                           // MPerthread
+    4,                           // NPerthread
+    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
     ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
     ck::Sequence<1>>;            // scalarPerVector: y
diff --git a/example/44_elementwise_permute/CMakeLists.txt b/example/44_elementwise_permute/CMakeLists.txt
index 3cf4812509..afbf948683 100644
--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
@@ -1,12 +1,7 @@
 add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
-add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
 add_example_executable(example_elementwise_permute_4D_fp32_row elementwise_permute_4D_fp32_row.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp)
 add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
 add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
 add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
-add_example_executable(example_elementwise_permute elementwise_permute.cpp)
-if((NOT GPU_TARGETS
MATCHES "gfx940") AND (NOT GPU_TARGETS MATCHES "gfx941") AND (NOT GPU_TARGETS MATCHES "gfx942")) - add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp) -endif() diff --git a/example/44_elementwise_permute/elementwise_permute.cpp b/example/44_elementwise_permute/elementwise_permute.cpp deleted file mode 100644 index d3c3085eb8..0000000000 --- a/example/44_elementwise_permute/elementwise_permute.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" - -#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp" - -#include "ck/library/utility/algorithm.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -using F16 = ck::half_t; -using F32 = float; - -using ADataType = F16; -using BDataType = F16; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using DeviceElementwisePermuteInstance = - ck::tensor_operation::device::DeviceElementwiseImpl, // InDataTypeTuple - ck::Tuple, // OutDataTypeTuple - PassThrough, // ElementwiseOp - 5, // NumDim - 8, // MPerThread - ck::Sequence<1>, // InScalarPerVectorSeq - ck::Sequence<1>>; // OutScalarPerVectorSeq - -int main() -{ - bool do_verification = true; - bool time_kernel = true; - - std::vector ncdhw = {16, 8, 8, 8, 8}; - std::vector ndhwc = {16, 8, 8, 8, 8}; - std::array ab_lengths; - - std::array a_strides = { - static_cast(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]), - static_cast(ncdhw[3] * ncdhw[4]), - static_cast(ncdhw[4]), - 1, - static_cast(ncdhw[2] * ncdhw[3] * ncdhw[4])}; - - std::array b_strides = { - static_cast(ndhwc[1] * ndhwc[2] * ndhwc[3] * ndhwc[4]), - static_cast(ndhwc[2] * ndhwc[3] * ndhwc[4]), - static_cast(ndhwc[3] * ndhwc[4]), - static_cast(ndhwc[4]), - 1}; - ck::ranges::copy(ncdhw, ab_lengths.begin()); - - std::array, 1> as = {Tensor(ab_lengths, a_strides)}; - Tensor& a = as[0]; - Tensor b(ab_lengths, b_strides); - - a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - - DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); - - a_device_buf.ToDevice(a.mData.data()); - - std::array input = {a_device_buf.GetDeviceBuffer()}; - std::array output = {b_device_buf.GetDeviceBuffer()}; - - auto broadcastPermute = DeviceElementwisePermuteInstance{}; - auto argument = broadcastPermute.MakeArgumentPointer( - ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); - - if(!broadcastPermute.IsSupportedArgument(argument.get())) - { - throw std::runtime_error( - "The runtime parameters seems not supported by the device instance, exiting!"); - }; - - std::cout << "A (ncdhw): " << a.mDesc << std::endl; - std::cout << "B (ndhwc): " << b.mDesc << std::endl; - - auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); - float ave_time = - broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); - std::size_t flop = std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]; - - std::size_t num_btype = - sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) + - 
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - bool pass = true; - - if(do_verification) - { - Tensor host_b(ab_lengths, b_strides); - using ReferenceElementwiseInstance = - ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>; - auto ref_elementwise = ReferenceElementwiseInstance{}; - auto ref_invoker = ref_elementwise.MakeInvoker(); - - auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{}); - ref_invoker.Run(ref_argument); - - b_device_buf.FromDevice(b.mData.data()); - pass &= - ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); - } - - return pass ? 0 : 1; -} diff --git a/example/44_elementwise_permute/elementwise_permute_3d.cpp b/example/44_elementwise_permute/elementwise_permute_3d.cpp deleted file mode 100644 index 47d8c4de65..0000000000 --- a/example/44_elementwise_permute/elementwise_permute_3d.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp" - -#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp" - -#include "ck/library/utility/algorithm.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -using F16 = ck::half_t; -using F32 = float; - -using ADataType = F32; -using BDataType = F32; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using DeviceElementwisePermuteInstance = - ck::tensor_operation::device::DeviceElementwise3dImpl, // InDataTypeTuple - ck::Tuple, // OutDataTypeTuple - PassThrough, // ElementwiseOp - 2, // NumDim_m, {N, C} - 2, // NumDim_n, {H, W} - 1, // NumDim_k, {D} - 4, // MPerThread - 4, // NPerThread - 4, // KPerThread - ck::Sequence<4>, // InScalarPerVectorSeq - ck::Sequence<4>>; // OutScalarPerVectorSeq - -int main() -{ - bool do_verification = true; - bool time_kernel = true; - - const int N = 4; - const int C = 16; - const int H = 32; - const int W = 5; - const int D = 16; - - std::array ab_lengths{N, C, H, W, D}; - std::array a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W - std::array b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C - - std::array, 1> as = {Tensor(ab_lengths, a_strides)}; - Tensor& a = as[0]; - Tensor b(ab_lengths, b_strides); - - a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - - DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); - - a_device_buf.ToDevice(a.mData.data()); - - std::array input = {a_device_buf.GetDeviceBuffer()}; - std::array output = {b_device_buf.GetDeviceBuffer()}; - - auto broadcastPermute = DeviceElementwisePermuteInstance{}; - auto argument = broadcastPermute.MakeArgumentPointer( - ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); - - if(!broadcastPermute.IsSupportedArgument(argument.get())) - { - throw std::runtime_error( - 
"The runtime parameters seems not supported by the device instance, exiting!"); - }; - - std::cout << "A (ncdhw): " << a.mDesc << std::endl; - std::cout << "B (ndhwc): " << b.mDesc << std::endl; - - auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); - float ave_time = - broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); - std::size_t flop = std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * - ab_lengths[3] * ab_lengths[4]; - - std::size_t num_btype = - (sizeof(ADataType) + sizeof(BDataType)) * - (ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3] * ab_lengths[4]); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - bool pass = true; - - if(do_verification) - { - Tensor host_b(ab_lengths, b_strides); - - using ReferenceElementwiseInstance = - ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>; - auto ref_elementwise = ReferenceElementwiseInstance{}; - auto ref_invoker = ref_elementwise.MakeInvoker(); - - auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{}); - ref_invoker.Run(ref_argument); - - b_device_buf.FromDevice(b.mData.data()); - pass &= - ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); - } - - return pass ? 0 : 1; -} diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp deleted file mode 100644 index 1747e6dd8b..0000000000 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp" - -#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -using F16 = ck::half_t; - -using ADataType = F16; -using BDataType = F16; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using DeviceElementwisePermuteInstance = - ck::tensor_operation::device::DeviceElementwise2dImpl, // InDataTypeTuple - ck::Tuple, // OutDataTypeTuple - PassThrough, // Elementwise op - 3, // NumDim_M - 1, // NumDim_N - 1, // MPerThread - 1, // NPerThread - ck::Sequence<1>, // InScalarPerVectorSeq - ck::Sequence<1>>; // OutScalarPerVectorSeq - -int main() -{ - bool do_verification = true; - bool time_kernel = true; - - const int N = 120; - const int C = 128; - const int H = 32; - const int W = 32; - - std::array ab_lengths{N, H, W, C}; - - std::array a_strides = {C * H * W, W, 1, H * W}; - std::array b_strides = {H * W * C, W * C, C, 1}; - - std::array, 1> as = {Tensor(ab_lengths, a_strides)}; - Tensor& a = as[0]; - Tensor b(ab_lengths, b_strides); - - a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - - DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); - - a_device_buf.ToDevice(a.mData.data()); - - std::array input = {a_device_buf.GetDeviceBuffer()}; - std::array output = {b_device_buf.GetDeviceBuffer()}; - - auto broadcastPermute = DeviceElementwisePermuteInstance{}; - auto argument = broadcastPermute.MakeArgumentPointer( - ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); - - if(!broadcastPermute.IsSupportedArgument(argument.get())) - { - throw std::runtime_error( - "The runtime parameters seems not supported by the device instance, exiting!"); - }; - - std::cout << "A (nchw): " << a.mDesc << std::endl; - std::cout << "B (nhwc): " << b.mDesc << std::endl; - - auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); - float ave_time = - broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t flop = - std::size_t(2) * ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3]; - - std::size_t num_btype = (sizeof(ADataType) + sizeof(BDataType)) * - (ab_lengths[0] * ab_lengths[1] * ab_lengths[2] * ab_lengths[3]); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - bool pass = true; - - if(do_verification) - { - Tensor host_b(ab_lengths, b_strides); - using ReferenceElementwiseInstance = - ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>; - auto ref_elementwise = ReferenceElementwiseInstance{}; - auto ref_invoker = ref_elementwise.MakeInvoker(); - - auto ref_argument = ref_elementwise.MakeArgument(as, host_b, PassThrough{}); - ref_invoker.Run(ref_argument); - - b_device_buf.FromDevice(b.mData.data()); - pass &= - ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); - } - - return pass ? 
0 : 1; -} diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp index 3b0cbc6e5f..ac6ff0a960 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -13,6 +13,10 @@ namespace ck { namespace tensor_operation { namespace device { +/** + * \note This structure is deprecated (left for backwards compatibility). Please use + * DeviceElementwise from device_elementwise.hpp. + */ template {}; static constexpr auto I2 = Number<2>{}; - static constexpr auto MPerThread = Number<4>{}; + static constexpr index_t MPerThread = + MPerBlock / CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1); + static constexpr index_t NPerThread = + NPerBlock / CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3); + static constexpr auto AScalarPerVector = Number<4>{}; static constexpr auto BScalarPerVector = Number<4>{}; static constexpr auto CScalarPerVector = Number<4>{}; - template - static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) + template + static auto PadDescriptor_M_N(Desc_M_N desc) { - const auto M = desc_m.GetLength(I0); - const index_t loop_step = gridSize * blockSize * MPerThread; - const auto pad = math::integer_least_multiple(M, loop_step) - M; - const auto desc_m_pad = - transform_tensor_descriptor(desc_m, - make_tuple(make_right_pad_transform(M, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m_pad; + const auto M = desc.GetLength(I0); + const auto N = desc.GetLength(I1); + const auto pad_M = math::integer_divide_ceil(M, MPerThread) * MPerThread - M; + const auto pad_N = math::integer_divide_ceil(N, NPerThread) * NPerThread - N; + + const auto padded_desc = transform_tensor_descriptor( + desc, + make_tuple(make_right_pad_transform(M, pad_M), make_right_pad_transform(N, pad_N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return padded_desc; } - static auto MakeDescriptor_M(const std::vector& lengths, - const std::vector& strides, - index_t gridSize, - index_t blockSize) + static auto MakeDescriptor_M_N(const std::vector& lengths, + const std::vector& strides) { auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<2>{}); auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<2>{}); // nd desc - [s0, s1, s2, ...] 
- const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - const auto desc_m = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleOfShape)), - make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<2>{})), - make_tuple(Sequence<0>{})); - - return PadDescriptor_M_1d(desc_m, gridSize, blockSize); + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + return PadDescriptor_M_N(desc); } // GridwiseGemm @@ -166,7 +165,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle CShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched>; - using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); + using CGridDesc_M_N = decltype(MakeDescriptor_M_N({1, 1}, {1, 1})); // Argument struct Argument : public tensor_operation::device::BaseArgument, public GridwiseGemm::Problem @@ -195,17 +194,13 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle p_c_grid_imag{p_c_grid_imag_}, p_aux_grid{p_workspace} { - const index_t grid_size = std::get<1>(GridwiseGemm::CalculateGridSize(M_, N_)); - if constexpr(is_same::value) { - c_grid_desc_m = - DeviceOp::MakeDescriptor_M({M_, N_}, {StrideC_, I1}, grid_size, BlockSize); + c_grid_desc_m_n = DeviceOp::MakeDescriptor_M_N({M_, N_}, {StrideC_, I1}); } else if constexpr(is_same::value) { - c_grid_desc_m = - DeviceOp::MakeDescriptor_M({M_, N_}, {I1, StrideC_}, grid_size, BlockSize); + c_grid_desc_m_n = DeviceOp::MakeDescriptor_M_N({M_, N_}, {I1, StrideC_}); } p_aux_2_grid = p_workspace + GetCElementSpaceSize(M_, N_, StrideC_); @@ -220,7 +215,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle CDataType* p_c_grid_imag; CDataType* p_aux_grid; CDataType* p_aux_2_grid; - CGridDesc_M c_grid_desc_m; + CGridDesc_M_N c_grid_desc_m_n; }; // Invoker @@ -248,40 +243,63 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle using Add = ck::tensor_operation::element_wise::Add; using Subtract = ck::tensor_operation::element_wise::Subtract; - using GridwiseBinAdd = - GridwiseElementwise_1D, - Tuple, - Tuple, - Tuple, - Add, - MPerThread, - Sequence, - Sequence>; + using Block2TileMap = BlockToCTileMap_M00_N0_M01Adapt; + + using GridwiseBinAdd = GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + Add, + BlockSize, + MPerBlock, + NPerBlock, + MPerThread, + NPerThread, + Sequence<0, 1>, + Sequence, + Sequence, + I1, + I1>; using GridwiseBinSubtract = - GridwiseElementwise_1D, - Tuple, - Tuple, - Tuple, - Subtract, - MPerThread, - Sequence, - Sequence>; + GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + Subtract, + BlockSize, + MPerBlock, + NPerBlock, + MPerThread, + NPerThread, + Sequence<0, 1>, + Sequence, + Sequence, + I1, + I1>; - const auto add_kernel = kernel_elementwise_1d, - Tuple, - Tuple, - Tuple, - Add>; + const index_t M = arg.c_grid_desc_m_n.GetLength(I0); + const index_t N = arg.c_grid_desc_m_n.GetLength(I1); + const auto block_2_tile_map = Block2TileMap(M, N); + + const auto add_kernel = kernel_elementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + Add>; const auto subtract_kernel = - kernel_elementwise_1d, - Tuple, - Tuple, - Tuple, - Subtract>; + kernel_elementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + Subtract>; if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) { @@ -318,11 +336,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m), - make_tuple(arg.c_grid_desc_m), + make_tuple(arg.c_grid_desc_m_n, arg.c_grid_desc_m_n), + make_tuple(arg.c_grid_desc_m_n), make_tuple(const_cast(arg.p_aux_grid), 
const_cast(arg.p_aux_2_grid)), make_tuple(arg.p_c_grid_real), + block_2_tile_map, Subtract{}); ave_time += launch_and_time_kernel(stream_config, @@ -352,11 +371,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m), - make_tuple(arg.c_grid_desc_m), + make_tuple(arg.c_grid_desc_m_n, arg.c_grid_desc_m_n), + make_tuple(arg.c_grid_desc_m_n), make_tuple(const_cast(arg.p_aux_grid), const_cast(arg.p_aux_2_grid)), make_tuple(arg.p_c_grid_imag), + block_2_tile_map, Add{}); } else @@ -394,11 +414,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m), - make_tuple(arg.c_grid_desc_m), + make_tuple(arg.c_grid_desc_m_n, arg.c_grid_desc_m_n), + make_tuple(arg.c_grid_desc_m_n), make_tuple(const_cast(arg.p_aux_grid), const_cast(arg.p_aux_2_grid)), make_tuple(arg.p_c_grid_real), + block_2_tile_map, Subtract{}); ave_time += launch_and_time_kernel(stream_config, @@ -428,11 +449,12 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - make_tuple(arg.c_grid_desc_m, arg.c_grid_desc_m), - make_tuple(arg.c_grid_desc_m), + make_tuple(arg.c_grid_desc_m_n, arg.c_grid_desc_m_n), + make_tuple(arg.c_grid_desc_m_n), make_tuple(const_cast(arg.p_aux_grid), const_cast(arg.p_aux_2_grid)), make_tuple(arg.p_c_grid_imag), + block_2_tile_map, Add{}); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp deleted file mode 100644 index 02ef29e32d..0000000000 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp +++ /dev/null @@ -1,338 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/utility/math.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" - -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/stream_utility.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -template -struct DeviceElementwise2dImpl : public DeviceElementwise -{ - static constexpr index_t NumDim = NumDim_m + NumDim_n; - - static constexpr int NumInput = InDataTypeTuple::Size(); - static constexpr int NumOutput = OutDataTypeTuple::Size(); - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static auto GenerateInDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - }; - - static auto GenerateOutDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - }; - - using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); - using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); - - template - static auto PadDescriptor_MN_2d(Desc_MN desc_mn, - index_t gridSize, - index_t blockSize, - index_t num_threads_m, - index_t num_threads_n) - { - std::ignore = blockSize; - std::ignore = gridSize; - const auto m = desc_mn.GetLength(I0); - const auto n = desc_mn.GetLength(I1); - const index_t loop_step_m = num_threads_m * MPerThread; - const index_t loop_step_n = num_threads_n * NPerThread; - const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; - const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; - - const auto desc_mn_pad = transform_tensor_descriptor( - desc_mn, - make_tuple(make_right_pad_transform(m, pad_m), make_right_pad_transform(n, pad_n)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - return desc_mn_pad; - } - - static auto MakeDescriptor_MN(const std::array& lengths, - const std::array& stride, - index_t gridSize, - index_t blockSize, - index_t num_threads_m, - index_t num_threads_n) - { - auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] - const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type(); - constexpr auto nDimIds = - typename arithmetic_sequence_gen::type(); - - const auto mLengths = get_container_subset(tupleOfShape, mDimIds); - const auto nLengths = get_container_subset(tupleOfShape, nDimIds); - - // merge nd to 2d desc - [s0 * s1 * ...] 
- - if constexpr(NumDim > 2) - { - const auto desc_mn = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), - make_tuple(mDimIds, nDimIds), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return PadDescriptor_MN_2d(desc_mn, gridSize, blockSize, num_threads_m, num_threads_n); - } - else - return PadDescriptor_MN_2d(desc, gridSize, blockSize, num_threads_m, num_threads_n); - } - - template - static auto GenerateInOutGrid2dDescTuple(Number) - { - return generate_tuple( - [&](auto) { - if constexpr(NumDim > 2) - { - return MakeDescriptor_MN({1, 1}, {1, 1}, 1, 1, 1, 1); - } - else - { - return MakeDescriptor_MN({1}, {1}, 1, 1, 1, 1); - }; - }, - Number{}); - }; - - using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number{})); - using InGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number{})); - - using GridwiseElementwise = GridwiseElementwise_2D; - - struct Argument : public BaseArgument - { - Argument(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) - - : lengths_(lengths), - inStridesArray_(inStridesArray), - outStridesArray_(outStridesArray), - elementwise_op_(elementwise_op), - blockSize_(256) - { - static_assert(NumDim_m > 0, ""); - static_assert(NumDim_n > 0, ""); - - in_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(in_dev_buffers[I.value]); - }, - Number{}); - - out_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(out_dev_buffers[I.value]); - }, - Number{}); - } - - InDataTypePointerTuple in_dev_buffers_; - OutDataTypePointerTuple out_dev_buffers_; - - std::array lengths_; - std::array, NumInput> inStridesArray_; - std::array, NumOutput> outStridesArray_; - - ElementwiseOperation elementwise_op_; - index_t blockSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - index_t gridSize = getAvailableComputeUnitCount(stream_config); - index_t num_threads_m = (gridSize * arg.blockSize_) / 16; - index_t num_threads_n = 16; - - auto in_grid_2d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_MN(arg.lengths_, - arg.inStridesArray_[I.value], - gridSize, - arg.blockSize_, - num_threads_m, - num_threads_n); - }, - Number{}); - - auto out_grid_2d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_MN(arg.lengths_, - arg.outStridesArray_[I.value], - gridSize, - arg.blockSize_, - num_threads_m, - num_threads_n); - }, - Number{}); - - const auto kernel = kernel_elementwise_2d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(gridSize), - dim3(arg.blockSize_), - 0, - in_grid_2d_desc_tuple, - out_grid_2d_desc_tuple, - arg.in_dev_buffers_, - arg.out_dev_buffers_, - arg.elementwise_op_, - num_threads_m, - num_threads_n); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - if(pArg->lengths_.back() % MPerThread != 0) - return false; - - auto IsScalarPerVectorValid 
= [&](const std::array& lengths, - const std::array& strides, - index_t scalarPerVector, - index_t vectorDim) { - if(strides[vectorDim] == 1 && - (lengths[vectorDim] % scalarPerVector == 0 || - lengths[vectorDim] % scalarPerVector == lengths[vectorDim])) - { - return true; - } - if(strides[vectorDim] != 1 && scalarPerVector == strides[vectorDim]) - { - return true; - } - return false; - }; - - bool valid = true; - static_for<0, NumInput, 1>{}([&](auto I) { - if(!IsScalarPerVectorValid(pArg->lengths_, - pArg->inStridesArray_[I.value], - InScalarPerVectorSeq::At(I), - NumDim_m - 1)) - valid = false; - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - if(!IsScalarPerVectorValid(pArg->lengths_, - pArg->outStridesArray_[I.value], - OutScalarPerVectorSeq::At(I), - NumDim - 1)) - valid = false; - }); - - return valid; - }; - - std::unique_ptr - MakeArgumentPointer(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) override - { - return std::make_unique(lengths, - inStridesArray, - outStridesArray, - in_dev_buffers, - out_dev_buffers, - elementwise_op); - } - - static auto MakeInvoker() { return Invoker{}; } - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; -}; // namespace device - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp deleted file mode 100644 index 67b6f87465..0000000000 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/utility/math.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" - -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/stream_utility.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -template -struct DeviceElementwise3dImpl : public DeviceElementwise -{ - static constexpr index_t NumDim = NumDim_m + NumDim_n + NumDim_k; - - static constexpr int NumInput = InDataTypeTuple::Size(); - static constexpr int NumOutput = OutDataTypeTuple::Size(); - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static auto GenerateInDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - } - - static auto GenerateOutDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - } - - using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); - using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); - - template - static auto PadDescriptor_MNK(Desc_MNK desc_mnk, - index_t gridSize, - index_t blockSize, - index_t num_threads_m, - index_t num_threads_n, - index_t num_threads_k) - { - std::ignore = blockSize; - std::ignore = gridSize; - - const auto m = desc_mnk.GetLength(I0); - const auto n = desc_mnk.GetLength(I1); - const auto k = desc_mnk.GetLength(I2); - - const index_t loop_step_m = num_threads_m * MPerThread; - const index_t loop_step_n = num_threads_n * NPerThread; - const index_t loop_step_k = num_threads_k * KPerThread; - - const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; - const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; - const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k; - - const auto desc_mnk_pad = - transform_tensor_descriptor(desc_mnk, - make_tuple(make_right_pad_transform(m, pad_m), - make_right_pad_transform(n, pad_n), - make_right_pad_transform(k, pad_k)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - return desc_mnk_pad; - } - - static auto MakeDescriptor_MNK(const std::array& lengths, - const std::array& stride, - index_t gridSize, - index_t blockSize, - index_t num_threads_m, - index_t num_threads_n, - index_t num_threads_k) - { - auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] 
- const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type(); - constexpr auto nDimIds = - typename arithmetic_sequence_gen::type(); - constexpr auto kDimIds = - typename arithmetic_sequence_gen::type(); - - const auto mLengths = get_container_subset(tupleOfShape, mDimIds); - const auto nLengths = get_container_subset(tupleOfShape, nDimIds); - const auto kLengths = get_container_subset(tupleOfShape, kDimIds); - - // merge nd to 3d desc - [s0 * s1 * ...] - if constexpr(NumDim > 3) - { - const auto desc_mnk = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(mLengths), - make_merge_transform(nLengths), - make_merge_transform(kLengths)), - make_tuple(mDimIds, nDimIds, kDimIds), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - return PadDescriptor_MNK( - desc_mnk, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k); - } - else - return PadDescriptor_MNK( - desc, gridSize, blockSize, num_threads_m, num_threads_n, num_threads_k); - } - - template - static auto GenerateInOutGrid3dDescTuple(Number) - { - return generate_tuple( - [&](auto) { - if constexpr(NumDim > 3) - { - return MakeDescriptor_MNK({1, 1, 1}, {1, 1, 1}, 1, 1, 1, 1, 1); - } - else - { - return MakeDescriptor_MNK({1}, {1}, 1, 1, 1, 1, 1); - }; - }, - Number{}); - } - - using OutGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number{})); - using InGrid3dDescTuple = decltype(GenerateInOutGrid3dDescTuple(Number{})); - - using GridwiseElementwise = GridwiseElementwise_3D; - - struct Argument : public BaseArgument - { - Argument(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) - - : lengths_(lengths), - inStridesArray_(inStridesArray), - outStridesArray_(outStridesArray), - elementwise_op_(elementwise_op), - blockSize_(256) - { - static_assert(NumDim_m > 0, ""); - static_assert(NumDim_n > 0, ""); - static_assert(NumDim_k > 0, ""); - - in_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(in_dev_buffers[I.value]); - }, - Number{}); - - out_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(out_dev_buffers[I.value]); - }, - Number{}); - } - - InDataTypePointerTuple in_dev_buffers_; - OutDataTypePointerTuple out_dev_buffers_; - - std::array lengths_; - std::array, NumInput> inStridesArray_; - std::array, NumOutput> outStridesArray_; - - ElementwiseOperation elementwise_op_; - index_t blockSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - index_t gridSize = getAvailableComputeUnitCount(stream_config) * arg.blockSize_; - index_t num_threads_m = gridSize / (16 * 16); - index_t num_threads_n = 16; - index_t num_threads_k = 16; - - auto in_grid_3d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_MNK(arg.lengths_, - arg.inStridesArray_[I.value], - gridSize, - arg.blockSize_, - num_threads_m, - num_threads_n, - num_threads_k); - }, - Number{}); - - auto out_grid_3d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_MNK(arg.lengths_, - arg.outStridesArray_[I.value], - gridSize, - arg.blockSize_, - num_threads_m, - num_threads_n, - num_threads_k); - }, - Number{}); - - const 
auto kernel = kernel_elementwise_3d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(gridSize), - dim3(arg.blockSize_), - 0, - in_grid_3d_desc_tuple, - out_grid_3d_desc_tuple, - arg.in_dev_buffers_, - arg.out_dev_buffers_, - arg.elementwise_op_, - num_threads_m, - num_threads_n, - num_threads_k); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - if((ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" || - ck::get_device_name() == "gfx942")) - { - return false; - } - - const Argument* pArg = dynamic_cast(p_arg); - - if(pArg == nullptr) - return false; - - if(pArg->lengths_.back() % MPerThread != 0) - return false; - - auto IsScalarPerVectorValid = [&](const std::array& lengths, - const std::array& strides, - index_t scalarPerVector, - index_t vectorDim) { - if(strides[vectorDim] == 1 && - (lengths[vectorDim] % scalarPerVector == 0 || - lengths[vectorDim] % scalarPerVector == lengths[vectorDim])) - { - return true; - } - - if(strides[vectorDim] >= scalarPerVector) - { - return true; - } - return false; - }; - - bool valid = true; - static_for<0, NumInput, 1>{}([&](auto I) { - valid = valid && IsScalarPerVectorValid(pArg->lengths_, - pArg->inStridesArray_[I.value], - InScalarPerVectorSeq::At(I), - NumDim_m - 1); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - valid = valid && IsScalarPerVectorValid(pArg->lengths_, - pArg->outStridesArray_[I.value], - OutScalarPerVectorSeq::At(I), - NumDim - 1); - }); - - return valid; - } - - std::unique_ptr - MakeArgumentPointer(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) override - { - return std::make_unique(lengths, - inStridesArray, - outStridesArray, - in_dev_buffers, - out_dev_buffers, - elementwise_op); - } - - static auto MakeInvoker() { return Invoker{}; } - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - } -}; // namespace device - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp index 4dba95e5d3..bdc6dc9981 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp @@ -9,7 +9,7 @@ #include "ck/utility/math.hpp" #include "ck/utility/sequence.hpp" #include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_dynamic_vector_dims.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" @@ -190,7 +190,8 @@ struct DeviceElementwiseImpl ThreadClusterArrangeOrder, InScalarPerVectorSeq, OutScalarPerVectorSeq, - false>; + I1, + I0>; using GridwiseElementwiseOpSameInOutVectorDim = GridwiseElementwise; + I1, + I1>; struct Argument : public BaseArgument { diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp deleted file mode 100644 index 1a44c3ed9c..0000000000 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp +++ /dev/null @@ -1,327 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include - -#include "ck/utility/math.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" - -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/stream_utility.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -template // Scalar per vec for each Output -struct DeviceElementwiseImpl - : public DeviceElementwise -{ - static constexpr int NumInput = InDataTypeTuple::Size(); - static constexpr int NumOutput = OutDataTypeTuple::Size(); - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static auto GenerateInDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - }; - - static auto GenerateOutDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - - return static_cast(nullptr); - }, - Number{}); - }; - - using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); - using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); - - template - static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) - { - constexpr auto I0 = Number<0>{}; - - const auto m = desc_m.GetLength(I0); - const index_t loop_step = gridSize * blockSize * MPerThread; - const auto pad = math::integer_least_multiple(m, loop_step) - m; - const auto desc_m_pad = - transform_tensor_descriptor(desc_m, - make_tuple(make_right_pad_transform(m, pad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - return desc_m_pad; - } - - static auto MakeDescriptor_M(const std::array& lengths, - const std::array& stride, - index_t gridSize, - index_t blockSize) - { - auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); - auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); - - // nd desc - [s0, s1, s2, ...] - const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); - - // merge nd to 1d desc - [s0 * s1 * ...] 
- if constexpr(NumDim > 1) - { - const auto desc_m = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleOfShape)), - make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), - make_tuple(Sequence<0>{})); - - return PadDescriptor_M_1d(desc_m, gridSize, blockSize); - } - else - return PadDescriptor_M_1d(desc, gridSize, blockSize); - } - - template - static auto GenerateInOutGrid1dDescTuple(Number) - { - return generate_tuple( - [&](auto) { - if constexpr(NumDim > 1) - { - return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1); - } - else - { - return MakeDescriptor_M({1}, {1}, 1, 1); - }; - }, - Number{}); - }; - - using InGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); - using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); - - using GridwiseElementwise = GridwiseElementwise_1D; - - struct Argument : public BaseArgument - { - Argument(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) - - : lengths_(lengths), - inStridesArray_(inStridesArray), - outStridesArray_(outStridesArray), - elementwise_op_(elementwise_op), - blockSize_(256) - { - in_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(in_dev_buffers[I.value]); - }, - Number{}); - - out_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(out_dev_buffers[I.value]); - }, - Number{}); - } - - InDataTypePointerTuple in_dev_buffers_; - OutDataTypePointerTuple out_dev_buffers_; - - std::array lengths_; - std::array, NumInput> inStridesArray_; - std::array, NumOutput> outStridesArray_; - - ElementwiseOperation elementwise_op_; - index_t blockSize_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - index_t gridSize = getAvailableComputeUnitCount(stream_config); - - auto in_grid_1d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_M( - arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_); - }, - Number{}); - - auto out_grid_1d_desc_tuple = generate_tuple( - [&](auto I) { - return MakeDescriptor_M( - arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_); - }, - Number{}); - - const auto kernel = kernel_elementwise_1d; - - float elapsed_time = launch_and_time_kernel(stream_config, - kernel, - dim3(gridSize), - dim3(arg.blockSize_), - 0, - in_grid_1d_desc_tuple, - out_grid_1d_desc_tuple, - arg.in_dev_buffers_, - arg.out_dev_buffers_, - arg.elementwise_op_); - return elapsed_time; - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - static bool IsSupportedArgument(const Argument& arg) - { - if(arg.lengths_.back() % MPerThread != 0) - return false; - - auto IsScalarPerVectorValid = [&](const std::array& lengths, - const std::array& strides, - index_t scalarPerVector) { - if(strides.back() == 1 && lengths.back() % scalarPerVector == 0) - return true; - - if(strides.back() != 1 && scalarPerVector == 1) - return true; - - return false; - }; - - bool valid = true; - static_for<0, NumInput, 1>{}([&](auto I) { - if(!IsScalarPerVectorValid( - arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I))) - valid = valid 
&& false; - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - if(!IsScalarPerVectorValid( - arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I))) - valid = valid && false; - }); - - return valid; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - return IsSupportedArgument(*dynamic_cast(p_arg)); - } - - static auto - MakeArgument(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) - { - return Argument{lengths, - inStridesArray, - outStridesArray, - in_dev_buffers, - out_dev_buffers, - elementwise_op}; - } - - std::unique_ptr - MakeArgumentPointer(const std::array lengths, - const std::array, NumInput> inStridesArray, - const std::array, NumOutput> outStridesArray, - const std::array in_dev_buffers, - const std::array out_dev_buffers, - ElementwiseOperation elementwise_op) override - { - return std::make_unique(lengths, - inStridesArray, - outStridesArray, - in_dev_buffers, - out_dev_buffers, - elementwise_op); - } - - static auto MakeInvoker() { return Invoker{}; } - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceElementwiseImpl<" ; - str << "NumDim_" << NumDim << ","; - str << "MPerThread_" << MPerThread << ","; - - str << "InScalarPerVector"; - static_for<0, InScalarPerVectorSeq::Size(), 1>{}([&](auto i) { str << "_" << InScalarPerVectorSeq::At(i).value; }); - str << ","; - str << "OutScalarPerVector"; - static_for<0, OutScalarPerVectorSeq::Size(), 1>{}([&](auto i) { str << "_" << OutScalarPerVectorSeq::At(i).value; }); - - str << ">"; - // clang-format on - - return str.str(); - } - -}; // namespace device - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp index 33d70b0b88..dff0530ee0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp @@ -19,6 +19,10 @@ namespace ck { namespace tensor_operation { namespace device { +/** + * \note This structure is deprecated (left for backwards compatibility). Please use + * DeviceElementwiseImpl from device_elementwise_dynamic_vector_dims_impl.hpp. 
+ */ template #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" @@ -522,7 +522,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle Sequence<0, 1>, decltype(MakeElementwiseInputSequence()), Sequence, - true>; + I1, + I1>; // Argument using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index f53ec8a4e8..c532eec99a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -814,8 +814,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // check device if(get_device_name() == "gfx908") { - if constexpr(!(is_same_v || is_same_v || - is_same_v)) + // FIXME: re-enable fp64 when SWDEV-335738 is fixed + if constexpr(!(is_same_v || is_same_v)) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp index 2d60c027bb..2f2bf714a7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp @@ -19,7 +19,7 @@ #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_dynamic_vector_dims.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include @@ -252,7 +252,8 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage Sequence<0, 1>, ElementwiseInputSequence, ck::Sequence, - true>; + I1, + I1>; // Block2CTileMap configuration parameter. static constexpr index_t B2E_M01 = 8; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp index e98a85defe..f86b181e7e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
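The `I1, I1` arguments appended to the GridwiseElementwise instantiations in the grouped-conv bwd-weight and grouped-GEMM two-stage diffs above are the new trailing SrcVectorDim/DstVectorDim parameters of the rewritten gridwise kernel (see gridwise_elementwise_2d.hpp below): they select which of the two tile dimensions each side vectorizes, and the kernel derives its per-side dimension access order from them. Below is a minimal free-standing sketch of that selection, assuming the same-dim/different-dim rule described in the kernel's comment; the names and the exact condition are illustrative, not CK's.

#include <type_traits>

template <int... Is>
struct Seq
{
};

// If src and dst vectorize along the same tile dim, both sides can walk the
// tile in (M0, M1) order; otherwise one side walks it transposed so that each
// side still issues contiguous accesses along its own vector dimension.
template <int SrcVectorDim, int DstVectorDim>
struct AccessOrders
{
    static_assert((SrcVectorDim == 0 || SrcVectorDim == 1) &&
                      (DstVectorDim == 0 || DstVectorDim == 1),
                  "Vector dim must be equal to 0 or 1.");

    using Src = std::conditional_t<SrcVectorDim == DstVectorDim, Seq<0, 1>, Seq<1, 0>>;
    using Dst = Seq<0, 1>;
};

static_assert(std::is_same_v<AccessOrders<1, 1>::Src, Seq<0, 1>>, "same dim: row-major walk");
static_assert(std::is_same_v<AccessOrders<0, 1>::Src, Seq<1, 0>>, "mixed dims: transposed walk");

int main() { return 0; }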
#pragma once @@ -8,10 +8,13 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp" + +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" +#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/stream_utility.hpp" @@ -36,9 +39,10 @@ struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd{}; + static constexpr auto I1 = Number<1>{}; template - static auto PadDescriptor_M_1d(Desc_M desc_m, index_t loop_step) + static auto PadDescriptor_M_1d(Desc_M& desc_m, index_t loop_step) { const auto m = desc_m.GetLength(I0); const auto pad = math::integer_least_multiple(m, loop_step) - m; @@ -56,7 +60,18 @@ struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd + static auto ExpendDescFirstDim(Desc_M desc_m) + { + return transform_tensor_descriptor( + desc_m, + make_tuple(make_unmerge_transform(make_tuple(I1, desc_m.GetLength(I0)))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + using InOutGrid1dDesc = decltype(MakeDescriptor_M(1, 1)); + using InOutGrid2dDesc = decltype(ExpendDescFirstDim(InOutGrid1dDesc{})); using GridwisePutElementSet = GridwisePutElement_1D; - using GridwiseCasting = GridwiseElementwise_1D, - Tuple, - Tuple, - Tuple, - UnaryConvert, - InOutVectorSize, - Sequence, - Sequence>; + static constexpr index_t BlockSize = 256; + static constexpr index_t MPerThread = 1; + static constexpr index_t NPerThread = InOutVectorSize; + static constexpr index_t MPerBlock = 1; + static constexpr index_t NPerBlock = BlockSize * NPerThread; + + using Block2TileMap = BlockToCTileMap_M00_N0_M01Adapt; + + using GridwiseCasting = GridwiseElementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + UnaryConvert, + BlockSize, + MPerBlock, + NPerBlock, + MPerThread, + NPerThread, + Sequence<0, 1>, + Sequence, + Sequence, + I1, + I1>; struct Argument : public BaseArgument { @@ -98,7 +129,7 @@ struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd; const auto cast_kernel = - kernel_elementwise_1d, - Tuple, - Tuple, - Tuple, - UnaryConvert>; + kernel_elementwise, + Tuple, + Tuple, + Tuple, + Block2TileMap, + UnaryConvert>; float elapsed_time = launch_and_time_kernel( stream_config, @@ -214,16 +246,25 @@ struct DeviceMaxPoolBwdImpl : public DeviceMaxPoolBwd(arg.p_workspace_), PassThrough{}); + InOutGrid2dDesc din_grid_desc_2d = ExpendDescFirstDim(din_grid_desc); + const index_t M = din_grid_desc_2d.GetLength(I0); + const index_t N = din_grid_desc_2d.GetLength(I1); + const auto block_2_tile_map = Block2TileMap(M, N); + const auto cast_kernel_grid_size = + block_2_tile_map.CalculateGridSize(din_grid_desc_2d); + elapsed_time += launch_and_time_kernel( stream_config, cast_kernel, - dim3(gridSize), + dim3(cast_kernel_grid_size), dim3(arg.blockSize_), 0, - ck::make_tuple(din_grid_desc), - ck::make_tuple(din_grid_desc), - static_cast(arg.p_workspace_), - arg.p_din_, + ck::make_tuple(din_grid_desc_2d), + ck::make_tuple(din_grid_desc_2d), + ck::make_tuple( + static_cast(arg.p_workspace_)), + ck::make_tuple(arg.p_din_), + block_2_tile_map, UnaryConvert{}); return 
elapsed_time; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp deleted file mode 100644 index d686c14b35..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -namespace ck { - -template -__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple, - const OutGrid1dDescTuple out_grid_1d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op) -{ - GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple, - out_grid_1d_desc_tuple, - p_in_global_tuple, - p_out_global_tuple, - elementwise_op); -} - -template -struct GridwiseElementwise_1D -{ - static constexpr index_t NumInput = InDataTypePointerTuple::Size(); - static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size() && - NumInput == InGrid1dDescTuple::Size() && - NumOutput == OutGrid1dDescTuple::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static constexpr auto I0 = Number<0>{}; - - static constexpr auto thread_buffer_desc_m = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - __device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple, - const OutGrid1dDescTuple out_grid_1d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op) - { - const index_t thread_global_id = get_thread_global_1d_id(); - - auto in_thread_buf_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return StaticBuffer{}; - }, - Number{}); - - auto out_thread_buf_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return StaticBuffer{}; - }, - Number{}); - - auto in_global_buf_tuple = generate_tuple( - [&](auto I) { - static_assert(in_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); - - return make_dynamic_buffer( - p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - auto out_global_buf_tuple = generate_tuple( - [&](auto I) { - static_assert(out_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); - - return make_dynamic_buffer( - p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread); - - const index_t blockSize = get_block_size(); - const index_t blockPerGrid = get_grid_size(); - const auto M = in_grid_1d_desc_tuple[I0].GetLength(I0); - const index_t loop_step = blockPerGrid * blockSize * MPerThread; - const auto loop_step_index = make_multi_index(loop_step); - - auto in_global_load_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = 
remove_cv_t>; - - return ThreadwiseTensorSliceTransfer_v2, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - InScalarPerVectorSeq::At( - I), // ScalarPerVector - 1, // SrcScalarStrideInVector - false>{in_grid_1d_desc_tuple[I], - thread_global_offset}; - }, - Number{}); - - auto out_global_store_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths - Sequence<0>, // DimAccessOrder - 0, // SrcVectorDim - OutScalarPerVectorSeq::At(I), - InMemoryDataOperationEnum::Set, - 1, - false>( - out_grid_1d_desc_tuple[I], thread_global_offset, PassThroughOp{}); - }, - Number{}); - - index_t num_iter = M / (loop_step); - do - { - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I], - in_global_buf_tuple[I], - thread_buffer_desc_m, - make_tuple(I0), - in_thread_buf_tuple(I)); - - in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I], - loop_step_index); - }); - - static_for<0, MPerThread, 1>{}([&](auto iM) { - // get reference to in data - const auto in_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM); }, - Number{}); - - // get reference to dst data - auto out_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); }, - Number{}); - - unpack2(elementwise_op, out_data_refs, in_data_refs); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).Run(thread_buffer_desc_m, - make_tuple(I0), - out_thread_buf_tuple[I], - out_grid_1d_desc_tuple[I], - out_global_buf_tuple(I)); - - out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I], - loop_step_index); - }); - } while(--num_iter); - } -}; - -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp index 48ae489f42..13e9f7bd5e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp index bf0e8c186c..3439aefa42 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -1,229 +1,232 @@ // SPDX-License-Identifier: MIT -// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -// +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
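For the max-pool backward path above, the cast kernel now runs through the 2D gridwise elementwise: ExpendDescFirstDim unmerges the flat length-M descriptor into a (1, M) view, and the launch dimension comes from tiling that view with MPerBlock x NPerBlock tiles. The host-side sketch below reproduces the resulting grid sizing under the tile constants chosen in the diff; it stands in for Block2TileMap::CalculateGridSize and is not the BlockToCTileMap_M00_N0_M01Adapt implementation itself.

#include <cstdio>

constexpr int BlockSize  = 256; // threads per block, as in the diff
constexpr int NPerThread = 4;   // stands in for InOutVectorSize (assumed value)
constexpr int MPerBlock  = 1;
constexpr int NPerBlock  = BlockSize * NPerThread;

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main()
{
    const int flat_len = 1 << 20;  // flat element count of dinput (example value)
    const int M = 1, N = flat_len; // ExpendDescFirstDim: (len) -> (1, len)
    const int grid_size = ceil_div(M, MPerBlock) * ceil_div(N, NPerBlock);
    std::printf("cast kernel grid: %d block(s) of %d threads\n", grid_size, BlockSize);
    return 0;
}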
+ #pragma once #include "ck/tensor_description/cluster_descriptor.hpp" #include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r2.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor/static_tensor.hpp" +#include "ck/utility/common_header.hpp" namespace ck { -template -__global__ void kernel_elementwise_2d(const InGrid2dDescTuple in_grid_2d_desc_tuple, - const OutGrid2dDescTuple out_grid_2d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op, - const index_t num_threads_m, - const index_t num_threads_n) +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_elementwise(const InGridDescTuple in_grid_desc_tuple, + const OutGridDescTuple out_grid_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const Block2TileMap block_2_tile_map, + const ElementwiseOperation elementwise_op) { - GridwiseElementwise2dFunctor::Run(in_grid_2d_desc_tuple, - out_grid_2d_desc_tuple, - p_in_global_tuple, - p_out_global_tuple, - elementwise_op, - num_threads_m, - num_threads_n); + GridwiseElementwiseFunctor::Run(in_grid_desc_tuple, + out_grid_desc_tuple, + p_in_global_tuple, + p_out_global_tuple, + block_2_tile_map, + elementwise_op); } -template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_elementwise(const InGridDescTuple in_grid_desc_tuple, + const OutGridDescTuple out_grid_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const Block2TileMap block_2_tile_map, + const ElementwiseOperation elementwise_op, + const index_t batch_count, + const std::array input_batch_strides, + const std::array output_batch_strides) +{ + static_assert(InGridDescTuple::Size() == NumInputs && + InDataTypePointerTuple::Size() == NumInputs); + static_assert(OutGridDescTuple::Size() == NumOutputs && + OutDataTypePointerTuple::Size() == NumOutputs); + + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + InDataTypePointerTuple p_in_global_with_offset_tuple; + OutDataTypePointerTuple p_out_global_with_offset_tuple; + + static_for<0, InDataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_in_global_with_offset_tuple(i) = p_in_global_tuple.At(i) + input_batch_strides[i] * g_idx; + }); + + static_for<0, OutDataTypePointerTuple::Size(), 1>{}([&](auto i) { + p_out_global_with_offset_tuple(i) = + p_out_global_tuple.At(i) + output_batch_strides[i] * g_idx; + }); + + GridwiseElementwiseFunctor::Run(in_grid_desc_tuple, + out_grid_desc_tuple, + p_in_global_with_offset_tuple, + p_out_global_with_offset_tuple, + block_2_tile_map, + elementwise_op); +} + +template -struct GridwiseElementwise_2D + typename OutScalarPerVectorSeq, + index_t SrcVectorDim, + index_t DstVectorDim> +struct GridwiseElementwise { static constexpr index_t NumInput = InDataTypePointerTuple::Size(); 
static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); static_assert(NumInput == InScalarPerVectorSeq::Size() && NumOutput == OutScalarPerVectorSeq::Size() && - NumInput == InGrid2dDescTuple::Size() && - NumOutput == OutGrid2dDescTuple::Size(), + NumInput == InGridDescTuple::Size() && NumOutput == OutGridDescTuple::Size(), "Tuple size is inconsistent with the number of in/out!"); static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; - static constexpr auto thread_buffer_desc_mn = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); + static_assert((SrcVectorDim == I0 || SrcVectorDim == I1) && + (DstVectorDim == I0 || DstVectorDim == I1), + "Vector dim must be equal to 0 or 1."); using PassThroughOp = tensor_operation::element_wise::PassThrough; - __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, - const OutGrid2dDescTuple out_grid_2d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op, - const index_t num_threads_m, - const index_t num_threads_n) + __device__ static void Run(const InGridDescTuple& in_grid_desc_tuple, + const OutGridDescTuple& out_grid_desc_tuple, + const InDataTypePointerTuple& p_in_global_tuple, + const OutDataTypePointerTuple& p_out_global_tuple, + const Block2TileMap& block_2_tile_map, + const ElementwiseOperation& elementwise_op) { - auto in_thread_buf_tuple = generate_tuple( + + constexpr auto src_datas = generate_tuple( [&](auto I) { using DataTypePointer = remove_cvref_t; using DataType = remove_cv_t>; - return StaticBuffer{}; + return DataType{}; }, Number{}); - auto out_thread_buf_tuple = generate_tuple( + constexpr auto dst_datas = generate_tuple( [&](auto I) { using DataTypePointer = remove_cvref_t; using DataType = remove_pointer_t; - return StaticBuffer{}; + return DataType{}; }, Number{}); - auto in_global_buf_tuple = generate_tuple( + const auto in_global_buf_tuple = generate_tuple( [&](auto I) { return make_dynamic_buffer( - p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); + p_in_global_tuple[I], in_grid_desc_tuple[I].GetElementSpaceSize()); }, Number{}); auto out_global_buf_tuple = generate_tuple( [&](auto I) { return make_dynamic_buffer( - p_out_global_tuple[I], out_grid_2d_desc_tuple[I].GetElementSpaceSize()); + p_out_global_tuple[I], out_grid_desc_tuple[I].GetElementSpaceSize()); }, Number{}); - const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0); - const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1); + const auto block_work_idx = + block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - const index_t loop_step_m = num_threads_m * MPerThread; - const index_t loop_step_n = num_threads_n * NPerThread; - - const index_t thread_1d_id = get_thread_global_1d_id(); - index_t tid_m = thread_1d_id / num_threads_n; - index_t tid_n = thread_1d_id % num_threads_n; - - const auto thread_global_offset = make_multi_index(tid_m * MPerThread, tid_n * NPerThread); - - auto in_global_load_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return ThreadwiseTensorSliceTransfer_v2< - DataType, - DataType, - decltype(in_grid_2d_desc_tuple[I]), - decltype(thread_buffer_desc_mn), - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - 0, // SrcVectorDim - InScalarPerVectorSeq::At(I), // ScalarPerVector - 1, // SrcScalarStrideInVector - true>{in_grid_2d_desc_tuple[I], 
thread_global_offset}; + const index_t m0_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * M0PerBlock); + const index_t m1_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * M1PerBlock); + const auto input_thread_grid_offset = generate_tuple( + [&](auto) { + return make_multi_index(m0_block_data_idx_on_grid, m1_block_data_idx_on_grid); }, Number{}); - - auto out_global_store_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return ThreadwiseTensorSliceTransfer_v1r3< - DataType, - DataType, - decltype(thread_buffer_desc_mn), - decltype(out_grid_2d_desc_tuple[I]), - PassThroughOp, - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - 1, // SrcVectorDim - 1, // OutScalarPerVectorSeq::At(I), - InMemoryDataOperationEnum::Set, - 1, - true>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{}); + const auto output_thread_grid_offset = generate_tuple( + [&](auto) { + return make_multi_index(m0_block_data_idx_on_grid, m1_block_data_idx_on_grid); }, Number{}); - index_t num_iter_m = M / (loop_step_m); - do - { - index_t num_iter_n = N / (loop_step_n); - do - { - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], - in_global_buf_tuple[I], - thread_buffer_desc_mn, - make_tuple(I0, I0), - in_thread_buf_tuple(I)); + using ThisThreadBlock = ThisThreadBlock; + // If src and dst have same vector dim, then: + // M0 dim - for src and dst vector load/store + // else: + // M0 dim - for dst vector load + // M1 dim - for src vector store + using SrcDimAccessOrder = + std::conditional_t, Sequence<1, 0>>; + using DstDimAccessOrder = + std::conditional_t, Sequence<1, 0>>; - in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], - make_multi_index(0, loop_step_n)); - }); + using ThreadClusterLengths = + Sequence{}, Number{}>; - static_for<0, MPerThread, 1>{}([&](auto iM) { - static_for<0, NPerThread, 1>{}([&](auto iN) { - constexpr auto offset = - thread_buffer_desc_mn.CalculateOffset(make_tuple(iM, iN)); - // get reference to in data - const auto in_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> const auto& { - return in_thread_buf_tuple(I)(Number{}); - }, - Number{}); - - // get referenec to dst data - auto out_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> auto& { - return out_thread_buf_tuple(I)(Number{}); - }, - Number{}); - unpack2(elementwise_op, out_data_refs, in_data_refs); - }); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).Run(thread_buffer_desc_mn, - make_tuple(I0, I0), - out_thread_buf_tuple[I], - out_grid_2d_desc_tuple[I], - out_global_buf_tuple(I)); - - out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I], - make_multi_index(0, loop_step_n)); - }); - - } while(--num_iter_n); - - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).MoveSrcSliceWindow( - in_grid_2d_desc_tuple[I], - make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n)); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).MoveDstSliceWindow( - out_grid_2d_desc_tuple[I], - make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n)); - }); - } while(--num_iter_m); + auto global_to_global_transfer = ThreadGroupTensorSliceTransfer_v4r2< + ThisThreadBlock, + ElementwiseOperation, + uniform_sequence_gen_t(InMemoryDataOperationEnum::Set)>, + Sequence, + 
ThreadClusterLengths, + ThreadClusterArrangeOrder, + decltype(src_datas), + decltype(dst_datas), + InGridDescTuple, + OutGridDescTuple, + SrcDimAccessOrder, + DstDimAccessOrder, + SrcVectorDim, + DstVectorDim, + InScalarPerVectorSeq, + OutScalarPerVectorSeq, + uniform_sequence_gen_t, + uniform_sequence_gen_t, + uniform_sequence_gen_t, + uniform_sequence_gen_t>{in_grid_desc_tuple, + input_thread_grid_offset, + out_grid_desc_tuple, + output_thread_grid_offset, + elementwise_op}; + global_to_global_transfer.Run( + in_grid_desc_tuple, in_global_buf_tuple, out_grid_desc_tuple, out_global_buf_tuple, I0); } }; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp deleted file mode 100644 index 242996019b..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-License-Identifier: MIT -// // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -namespace ck { - -template -__global__ void kernel_elementwise_3d(const InGrid3dDescTuple in_grid_3d_desc_tuple, - const OutGrid3dDescTuple out_grid_3d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op, - const index_t num_threads_m, - const index_t num_threads_n, - const index_t num_threads_k) -{ - GridwiseElementwise3dFunctor::Run(in_grid_3d_desc_tuple, - out_grid_3d_desc_tuple, - p_in_global_tuple, - p_out_global_tuple, - elementwise_op, - num_threads_m, - num_threads_n, - num_threads_k); -} - -template -struct GridwiseElementwise_3D -{ - static constexpr index_t NumInput = InDataTypePointerTuple::Size(); - static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size() && - NumInput == InGrid3dDescTuple::Size() && - NumOutput == OutGrid3dDescTuple::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - - static constexpr auto thread_buffer_desc_mnk = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, Number{})); - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - __device__ static void Run(const InGrid3dDescTuple in_grid_3d_desc_tuple, - const OutGrid3dDescTuple out_grid_3d_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const ElementwiseOperation elementwise_op, - const index_t num_threads_m, - const index_t num_threads_n, - const index_t num_threads_k) - { - auto in_thread_buf_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return StaticBuffer{}; - }, - Number{}); - - auto out_thread_buf_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return StaticBuffer{}; - }, - Number{}); - - auto in_global_buf_tuple = generate_tuple( - [&](auto I) { - return make_dynamic_buffer( - p_in_global_tuple[I], 
in_grid_3d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - auto out_global_buf_tuple = generate_tuple( - [&](auto I) { - return make_dynamic_buffer( - p_out_global_tuple[I], out_grid_3d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - const auto M = in_grid_3d_desc_tuple[I0].GetLength(I0); - const auto N = in_grid_3d_desc_tuple[I0].GetLength(I1); - const auto K = in_grid_3d_desc_tuple[I0].GetLength(I2); - - const index_t loop_step_m = num_threads_m * MPerThread; - const index_t loop_step_n = num_threads_n * NPerThread; - const index_t loop_step_k = num_threads_k * KPerThread; - - const index_t thread_1d_id = get_thread_global_1d_id(); - - const index_t tid_m = thread_1d_id / (num_threads_n * num_threads_k); - const index_t tid_nk = thread_1d_id % (num_threads_n * num_threads_k); - const index_t tid_n = tid_nk / num_threads_k; - const index_t tid_k = tid_nk % num_threads_k; - - const auto thread_global_offset = - make_multi_index(tid_m * MPerThread, tid_n * NPerThread, tid_k * KPerThread); - - auto in_global_load_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return ThreadwiseTensorSliceTransfer_v2< - DataType, - DataType, - decltype(in_grid_3d_desc_tuple[I]), - decltype(thread_buffer_desc_mnk), - Sequence, // SliceLengths - Sequence<0, 1, 2>, // DimAccessOrder - 01, // SrcVectorDim - InScalarPerVectorSeq::At(I), // InScalarPerVectorSeq::At(I), // - // ScalarPerVector - 1, // SrcScalarStrideInVector - true>{in_grid_3d_desc_tuple[I], thread_global_offset}; - }, - Number{}); - - auto out_global_store_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return ThreadwiseTensorSliceTransfer_v1r3< - DataType, - DataType, - decltype(thread_buffer_desc_mnk), - decltype(out_grid_3d_desc_tuple[I]), - PassThroughOp, - Sequence, // SliceLengths - Sequence<0, 1, 2>, // DimAccessOrder - 2, // SrcVectorDim - OutScalarPerVectorSeq::At(I), // OutScalarPerVectorSeq::At(I), - InMemoryDataOperationEnum::Set, - 1, - true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{}); - }, - Number{}); - - index_t num_iter_m = M / (loop_step_m); - do - { - index_t num_iter_n = N / (loop_step_n); - do - { - index_t num_iter_k = K / (loop_step_k); - do - { - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).Run(in_grid_3d_desc_tuple[I], - in_global_buf_tuple[I], - thread_buffer_desc_mnk, - make_tuple(I0, I0, I0), - in_thread_buf_tuple(I)); - - in_global_load_tuple(I).MoveSrcSliceWindow( - in_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k)); - }); - - static_for<0, MPerThread, 1>{}([&](auto iM) { - static_for<0, NPerThread, 1>{}([&](auto iN) { - static_for<0, KPerThread, 1>{}([&](auto iK) { - constexpr auto offset = - thread_buffer_desc_mnk.CalculateOffset(make_tuple(iM, iN, iK)); - // get reference to in data - const auto in_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> const auto& { - return in_thread_buf_tuple(I)(Number{}); - }, - Number{}); - - // get referenec to dst data - auto out_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> auto& { - return out_thread_buf_tuple(I)(Number{}); - }, - Number{}); - unpack2(elementwise_op, out_data_refs, in_data_refs); - }); - }); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).Run(thread_buffer_desc_mnk, - make_tuple(I0, I0, I0), - out_thread_buf_tuple[I], - out_grid_3d_desc_tuple[I], - 
out_global_buf_tuple(I)); - - out_global_store_tuple(I).MoveDstSliceWindow( - out_grid_3d_desc_tuple[I], make_multi_index(0, 0, loop_step_k)); - }); - } while(--num_iter_k); - - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).MoveSrcSliceWindow( - in_grid_3d_desc_tuple[I], - make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k)); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).MoveDstSliceWindow( - out_grid_3d_desc_tuple[I], - make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k)); - }); - - } while(--num_iter_n); - - static_for<0, NumInput, 1>{}([&](auto I) { - in_global_load_tuple(I).MoveSrcSliceWindow( - in_grid_3d_desc_tuple[I], - make_multi_index(loop_step_m, - -(N / loop_step_n) * loop_step_n, - -(K / loop_step_k) * loop_step_k)); - }); - - static_for<0, NumOutput, 1>{}([&](auto I) { - out_global_store_tuple(I).MoveDstSliceWindow( - out_grid_3d_desc_tuple[I], - make_multi_index(loop_step_m, - -(N / loop_step_n) * loop_step_n, - -(K / loop_step_k) * loop_step_k)); - }); - } while(--num_iter_m); - } -}; - -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_dynamic_vector_dims.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_dynamic_vector_dims.hpp deleted file mode 100644 index b0c1dcd47c..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_dynamic_vector_dims.hpp +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/tensor_description/cluster_descriptor.hpp" -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp" -#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r2.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor/static_tensor.hpp" -#include "ck/utility/common_header.hpp" - -namespace ck { - -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_elementwise(const InGridDescTuple in_grid_desc_tuple, - const OutGridDescTuple out_grid_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const Block2TileMap block_2_tile_map, - const ElementwiseOperation elementwise_op) -{ - GridwiseElementwiseFunctor::Run(in_grid_desc_tuple, - out_grid_desc_tuple, - p_in_global_tuple, - p_out_global_tuple, - block_2_tile_map, - elementwise_op); -} - -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_batched_elementwise(const InGridDescTuple in_grid_desc_tuple, - const OutGridDescTuple out_grid_desc_tuple, - const InDataTypePointerTuple p_in_global_tuple, - const OutDataTypePointerTuple p_out_global_tuple, - const Block2TileMap block_2_tile_map, - const ElementwiseOperation elementwise_op, - const index_t batch_count, - const std::array input_batch_strides, - const std::array output_batch_strides) -{ - static_assert(InGridDescTuple::Size() == NumInputs && - InDataTypePointerTuple::Size() == NumInputs); - static_assert(OutGridDescTuple::Size() == NumOutputs && - OutDataTypePointerTuple::Size() == NumOutputs); - - 
const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - - InDataTypePointerTuple p_in_global_with_offset_tuple; - OutDataTypePointerTuple p_out_global_with_offset_tuple; - - static_for<0, InDataTypePointerTuple::Size(), 1>{}([&](auto i) { - p_in_global_with_offset_tuple(i) = p_in_global_tuple.At(i) + input_batch_strides[i] * g_idx; - }); - - static_for<0, OutDataTypePointerTuple::Size(), 1>{}([&](auto i) { - p_out_global_with_offset_tuple(i) = - p_out_global_tuple.At(i) + output_batch_strides[i] * g_idx; - }); - - GridwiseElementwiseFunctor::Run(in_grid_desc_tuple, - out_grid_desc_tuple, - p_in_global_with_offset_tuple, - p_out_global_with_offset_tuple, - block_2_tile_map, - elementwise_op); -} - -template -struct GridwiseElementwise -{ - static constexpr index_t NumInput = InDataTypePointerTuple::Size(); - static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); - - static_assert(NumInput == InScalarPerVectorSeq::Size() && - NumOutput == OutScalarPerVectorSeq::Size() && - NumInput == InGridDescTuple::Size() && NumOutput == OutGridDescTuple::Size(), - "Tuple size is inconsistent with the number of in/out!"); - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - using PassThroughOp = tensor_operation::element_wise::PassThrough; - - __device__ static void Run(const InGridDescTuple& in_grid_desc_tuple, - const OutGridDescTuple& out_grid_desc_tuple, - const InDataTypePointerTuple& p_in_global_tuple, - const OutDataTypePointerTuple& p_out_global_tuple, - const Block2TileMap& block_2_tile_map, - const ElementwiseOperation& elementwise_op) - { - - constexpr auto src_datas = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return DataType{}; - }, - Number{}); - - constexpr auto dst_datas = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_pointer_t; - - return DataType{}; - }, - Number{}); - - const auto in_global_buf_tuple = generate_tuple( - [&](auto I) { - return make_dynamic_buffer( - p_in_global_tuple[I], in_grid_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - auto out_global_buf_tuple = generate_tuple( - [&](auto I) { - return make_dynamic_buffer( - p_out_global_tuple[I], out_grid_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - const auto block_work_idx = - block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - - const index_t m0_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I0] * M0PerBlock); - const index_t m1_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I1] * M1PerBlock); - const auto input_thread_grid_offset = generate_tuple( - [&](auto) { - return make_multi_index(m0_block_data_idx_on_grid, m1_block_data_idx_on_grid); - }, - Number{}); - const auto output_thread_grid_offset = generate_tuple( - [&](auto) { - return make_multi_index(m0_block_data_idx_on_grid, m1_block_data_idx_on_grid); - }, - Number{}); - - using ThisThreadBlock = ThisThreadBlock; - // If src and dst have same vector dim, then: - // M0 dim - for src and dst vector load/store - // else: - // M0 dim - for dst vector load - // M1 dim - for src vector store - using SrcDimAccessOrder = Sequence<0, 1>; - using DstDimAccessOrder = - std::conditional_t, Sequence<1, 0>>; - using SrcVectorDim = Number<1>; - using DstVectorDim = 
std::conditional_t, Number<0>>; - - using ThreadClusterLengths = - Sequence{}, Number{}>; - - auto global_to_global_transfer = ThreadGroupTensorSliceTransfer_v4r2< - ThisThreadBlock, - ElementwiseOperation, - uniform_sequence_gen_t(InMemoryDataOperationEnum::Set)>, - Sequence, - ThreadClusterLengths, - ThreadClusterArrangeOrder, - decltype(src_datas), - decltype(dst_datas), - InGridDescTuple, - OutGridDescTuple, - SrcDimAccessOrder, - DstDimAccessOrder, - SrcVectorDim{}, - DstVectorDim{}, - InScalarPerVectorSeq, - OutScalarPerVectorSeq, - uniform_sequence_gen_t, - uniform_sequence_gen_t, - uniform_sequence_gen_t, - uniform_sequence_gen_t>{in_grid_desc_tuple, - input_thread_grid_offset, - out_grid_desc_tuple, - output_thread_grid_offset, - elementwise_op}; - global_to_global_transfer.Run( - in_grid_desc_tuple, in_global_buf_tuple, out_grid_desc_tuple, out_global_buf_tuple, I0); - } -}; - -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp index 6ac0871a80..23ee5b5674 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose/device_transpose_instance.hpp @@ -1,9 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -22,18 +22,23 @@ using S = ck::Sequence; using device_transpose_f16_instances = std::tuple< // clang-format off - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<8>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<4>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 8, ck::Sequence<4>, ck::Sequence<4>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>> + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 128, 128, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 64, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>> // clang-format on >; using device_transpose_f32_instances = std::tuple< // clang-format off - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<1>>, - DeviceElementwise3dImpl, ck::Tuple, PassThrough, 2, 
2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<4>> + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 128, 128, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 64, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, + DeviceElementwiseImpl, ck::Tuple, PassThrough, 5, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp index 8801c309f9..a75de2c653 100644 --- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp @@ -1,12 +1,12 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "ck/ck.hpp" #include "ck/utility/tuple.hpp" #include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -23,21 +23,21 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; template using device_batchnorm_infer_bf16_instances = std::tuple < - // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence - DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> > + // Tuple, Tuple, NormalizeOp, Rank, BlockSize, MPerBlock, NPerBlock, MPerThread, NPerThread, ThreadClusterArrangerOrder, Sequence, Sequence 
+ DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 8, 8, 1, 1, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 4, 4, 4, 4>, Sequence<4> > >; // clang-format on diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp index b674cfc423..31a7b59dfd 100644 --- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp @@ -1,12 +1,12 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
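Written out, one entry of the updated instance lists (the bf16 list above and the f16/f32/f64 lists that follow) looks as below. The parameter order follows the new comment row: Rank, then BlockSize, MPerBlock, NPerBlock, MPerThread, NPerThread, the thread-cluster arrange order, and finally the input/output scalar-per-vector sequences. The datatype tuple shown is an assumption about the f16 instances (x plus f32 mean/variance and f16 scale/bias); each instance file defines its own.

#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

using F16       = ck::half_t;
using F32       = float;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;

// One new-style batchnorm-infer instance: Rank 4, 64 threads on a 16x16 tile
// with 2x2 elements per thread and vector width 2 on every operand.
using BatchnormInferInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<F16, F32, F32, F16, F16>, // x, mean, variance, scale, bias (assumed)
    ck::Tuple<F16>,                     // y
    Normalize,
    4,                                  // Rank
    64,                                 // BlockSize
    16,                                 // MPerBlock
    16,                                 // NPerBlock
    2,                                  // MPerThread
    2,                                  // NPerThread
    ck::Sequence<1, 0>,                 // ThreadClusterArrangeOrder
    ck::Sequence<2, 2, 2, 2, 2>,        // InScalarPerVectorSeq
    ck::Sequence<2>>;                   // OutScalarPerVectorSeq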
#include "ck/ck.hpp" #include "ck/utility/tuple.hpp" #include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -23,21 +23,21 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; template using device_batchnorm_infer_f16_instances = std::tuple < - // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence - DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> > + // Tuple, Tuple, NormalizeOp, Rank, BlockSize, MPerBlock, NPerBlock, MPerThread, NPerThread, ThreadClusterArrangerOrder, Sequence, Sequence + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 8, 8, 1, 1, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, 
ck::Sequence<1, 0>, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
+        DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
+        DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
    >;
// clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
index 05e3650887..3e08a23dca 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
@@ -21,21 +21,21 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
template
using device_batchnorm_infer_f32_instances = std::tuple <
-    // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
+    // Tuple, Tuple, NormalizeOp, Rank, BlockSize, MPerBlock, NPerBlock, MPerThread, NPerThread, ThreadClusterArrangeOrder, Sequence, Sequence
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 8, 8, 1, 1, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
    >;
// clang-format on
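The batchnorm-infer f32 list above migrates from the old `device_elementwise_impl.hpp` interface (functor, rank, `MPerThread`, and two scalar-per-vector sequences) to `device_elementwise_dynamic_vector_dims_impl.hpp`, whose `DeviceElementwiseImpl` additionally takes `BlockSize`, `MPerBlock`/`NPerBlock`, `MPerThread`/`NPerThread`, and a `ThreadClusterArrangeOrder`. A minimal sketch of one of the new instantiations with the element-type tuples written out; the five-input/one-output f32 tuples (x, mean, variance, gamma, beta -> y) are an assumption, since this diff view elides template-argument lists:

```cpp
#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

using F32       = float;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;

// Assumed input tuple: x, mean, variance, gamma, beta (elided in the diff view).
template <ck::index_t Rank>
using ExampleBatchnormInferInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
    ck::Tuple<F32, F32, F32, F32, F32>, // input data types (assumption)
    ck::Tuple<F32>,                     // output data type
    Normalize,                          // elementwise functor
    Rank,
    64,                          // BlockSize: threads per workgroup
    16,                          // MPerBlock
    16,                          // NPerBlock
    2,                           // MPerThread
    2,                           // NPerThread
    ck::Sequence<1, 0>,          // ThreadClusterArrangeOrder
    ck::Sequence<2, 2, 2, 2, 2>, // scalar-per-vector for each input
    ck::Sequence<2>>;            // scalar-per-vector for the output
```

With these values a 64-thread workgroup tiles a 16x16 block and each thread owns a 2x2 sub-tile, matching the `64, 16, 16, 2, 2` rows of the instance list.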
diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
index 15a02af021..7b6b6fbc47 100644
--- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
@@ -21,16 +21,16 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
template
using device_batchnorm_infer_f64_instances = std::tuple <
-    // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
-    DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >
+    // Tuple, Tuple, NormalizeOp, Rank, BlockSize, MPerBlock, NPerBlock, MPerThread, NPerThread, ThreadClusterArrangeOrder, Sequence, Sequence
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 8, 8, 1, 1, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
+    DeviceElementwiseImpl, Tuple, Normalize, Rank, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<2, 2, 2, 2, 2>, Sequence<2> >
    >;
// clang-format on
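The f64 list mirrors the f32 one but caps the scalar-per-vector widths at 2 instead of 4, consistent with the doubled element size. Note also that every new parameter set in this patch keeps the thread cluster exactly covering the block tile. A small self-checking sketch of that relationship; the helper name is illustrative, not CK API:

```cpp
#include <cstddef>

// Each workgroup of BlockSize threads must exactly cover an
// MPerBlock x NPerBlock tile when each thread handles an
// MPerThread x NPerThread sub-tile.
constexpr bool tile_params_consistent(std::size_t BlockSize,
                                      std::size_t MPerBlock, std::size_t NPerBlock,
                                      std::size_t MPerThread, std::size_t NPerThread)
{
    return BlockSize == (MPerBlock / MPerThread) * (NPerBlock / NPerThread);
}

// Parameter sets taken from the instances in this patch.
static_assert(tile_params_consistent(64, 8, 8, 1, 1));
static_assert(tile_params_consistent(64, 16, 16, 2, 2));
static_assert(tile_params_consistent(64, 32, 32, 4, 4));
static_assert(tile_params_consistent(64, 64, 64, 8, 8));
static_assert(tile_params_consistent(256, 64, 64, 4, 4));
static_assert(tile_params_consistent(256, 128, 128, 8, 8));
```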
diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index f2a5f0728a..1e9701cc7c 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -1,12 +1,12 @@
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
@@ -27,16 +27,20 @@ using outputType = F16;

using Normalize = ck::tensor_operation::element_wise::Normalize;

using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
    // clang-format off
-    //###################|| | functor| NDim| MPerThread| | |
-    DeviceElementwiseImpl, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
-    DeviceElementwiseImpl, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
-    DeviceElementwiseImpl, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >
+    //###################|| | functor| NDim| BlockSize| M/NPerBlock|| M/NPerThread| ThreadClusterArrangeOrder| | |
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 256, 128, 128, 8, 8, ck::Sequence<1, 0>, Sequence<8, 1, 1, 8, 8>, Sequence<8>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 64, 64, 64, 8, 8, ck::Sequence<1, 0>, Sequence<8, 1, 1, 8, 8>, Sequence<8>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, Sequence<4, 1, 1, 4, 4>, Sequence<4>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<4, 1, 1, 4, 4>, Sequence<4>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, Sequence<2, 1, 1, 2, 2>, Sequence<2>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 64, 16, 16, 2, 2, ck::Sequence<1, 0>, Sequence<2, 1, 1, 2, 2>, Sequence<2>>
    // clang-format on
    >;

using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
    // clang-format off
-    DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1>>,
+    DeviceElementwiseImpl, Tuple, Normalize, 2, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, Sequence<1, 1, 1, 1, 1>, Sequence<1>>
    // clang-format on
    >;
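For the normalize operation, the vectorized configurations now come in `BlockSize` 256 and 64 variants, and the generic fallback gains two scalar (`Sequence<1, ...>`) configurations that accept tensors the vectorized instances reject. Client code typically filters these with `IsSupportedArgument`; a hedged sketch of that selection loop, assuming the instance and argument vectors come from the usual CK factory machinery (the helper name is hypothetical):

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Return the first instance whose argument passes IsSupportedArgument,
// or an empty pointer if none does. Vectorized instances reject
// shapes/strides they cannot load as vectors; a scalar (generic)
// instance accepts what they reject.
template <typename OpPtr, typename ArgPtr>
OpPtr pick_first_supported(std::vector<OpPtr>& op_ptrs,
                           const std::vector<ArgPtr>& arg_ptrs)
{
    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
    {
        if(op_ptrs[i]->IsSupportedArgument(arg_ptrs[i].get()))
            return std::move(op_ptrs[i]);
    }
    return nullptr;
}
```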
diff --git a/profiler/include/profiler/profile_transpose_impl.hpp b/profiler/include/profiler/profile_transpose_impl.hpp
index a4f2cb6763..0baf2eac99 100644
--- a/profiler/include/profiler/profile_transpose_impl.hpp
+++ b/profiler/include/profiler/profile_transpose_impl.hpp
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once
@@ -11,7 +11,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp
index d04c9fa2c4..5aa9b34a1c 100644
--- a/profiler/src/profile_transpose.cpp
+++ b/profiler/src/profile_transpose.cpp
@@ -18,39 +18,6 @@ enum struct DataType
#define OP_NAME "transpose"
#define OP_DESC "Transpose"

-struct TransposeArgParser
-{
-    std::unordered_map> long_opts = {{"lengths", {}}};
-
-    bool parse_opt(const int argc, char* argv[], const std::string& key, int i)
-    {
-        if(std::string("--") + key == argv[i])
-        {
-            const int pos = i;
-            while(++i < argc && argv[i][0] != '-') {}
-            int end = i;
-            for(int j = pos + 1; j < end; j++)
-            {
-                long_opts[key].push_back(std::stoi(argv[j]));
-            }
-            return true;
-        }
-        return false;
-    }
-
-    void operator()(int argc, char* argv[])
-    {
-        for(auto& kv : long_opts)
-        {
-            for(int i = 1; i < argc; i++)
-            {
-                if(parse_opt(argc, argv, kv.first, i))
-                    break;
-            }
-        }
-    }
-};
-
static void print_helper_msg()
{
    printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
@@ -59,25 +26,27 @@ static void print_helper_msg()
    printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
    printf("arg5: print tensor value (0: no; 1: yes)\n");
    printf("arg6: time kernel (0=no, 1=yes)\n");
-    printf("arg7: --lengths: N, C, D, H, W\n");
+    printf("arg7 to arg11: N, C, D, H, W\n");
}

int profile_transpose(int argc, char* argv[])
{
-    if(argc != 7)
+    if(argc != 12)
    {
        print_helper_msg();
        exit(1);
    }
-    TransposeArgParser arg_parser;
-    const auto data_type       = static_cast(std::stoi(argv[2]));
-    const bool do_verification = std::stoi(argv[3]);
-    const int init_method      = std::stoi(argv[4]);
-    const bool do_log          = std::stoi(argv[5]);
-    const bool time_kernel     = std::stoi(argv[6]);
-    arg_parser(argc, argv);
-    const std::vector lengths = arg_parser.long_opts["lengths"];
+    const auto data_type       = static_cast(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method      = std::stoi(argv[4]);
+    const bool do_log          = std::stoi(argv[5]);
+    const bool time_kernel     = std::stoi(argv[6]);
+    const std::vector lengths = {std::stoi(argv[7]),
+                                 std::stoi(argv[8]),
+                                 std::stoi(argv[9]),
+                                 std::stoi(argv[10]),
+                                 std::stoi(argv[11])};

    using F32 = float;
    using F16 = ck::half_t;
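Finally, `profile_transpose` drops its ad-hoc `--lengths` parser in favor of five positional arguments, so an invocation now passes N, C, D, H, W directly as arg7 to arg11, e.g. `ckProfiler transpose 0 1 1 0 1 8 16 16 32 32` (binary name and argument values illustrative). A minimal standalone sketch of the new positional contract, assuming only what the help message above states:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

int main(int argc, char* argv[])
{
    // Program name + 11 positional arguments, as in the argc != 12 check.
    if(argc != 12)
    {
        std::cerr << "usage: <op> <dtype> <verify> <init> <log> <time> N C D H W\n";
        return EXIT_FAILURE;
    }

    // arg7..arg11 map directly to the N, C, D, H, W lengths.
    std::vector<int> lengths;
    for(int i = 7; i < 12; ++i)
        lengths.push_back(std::stoi(argv[i]));

    for(int l : lengths)
        std::cout << l << ' ';
    std::cout << '\n';
    return EXIT_SUCCESS;
}
```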