From 3f732cceab64a2d653f348686cd4c50114798bf6 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 31 Mar 2022 12:33:34 -0500 Subject: [PATCH] Compile for gfx908 and gfx90a (#130) * adding compilation for multiple targets * fix build * clean * update Jekinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean [ROCm/composable_kernel commit: cd167e492a8f85ec6f5965e50667e8a58d3aa3a1] --- Jenkinsfile | 6 +- README.md | 44 ++ example/01_gemm/README.md | 39 +- example/01_gemm/gemm_xdl_fp16.cpp | 2 +- example/02_gemm_alpha_beta/README.md | 39 +- example/03_gemm_bias_relu/README.md | 39 +- example/04_gemm_bias_relu_add/README.md | 39 +- example/05_conv2d_fwd/README.md | 39 +- example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp | 2 +- example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp | 2 +- example/06_conv2d_fwd_bias_relu/README.md | 47 +- .../conv2d_fwd_xdl_bias_relu.cpp | 4 +- example/07_conv2d_fwd_bias_relu_add/README.md | 45 +- .../conv2d_fwd_xdl_bias_relu_add.cpp | 2 +- example/08_conv3d_fwd/README.md | 51 +- example/08_conv3d_fwd/conv3d_fwd_xdl.cpp | 2 +- example/09_convnd_fwd/README.md | 39 +- example/09_convnd_fwd/convnd_fwd_xdl.cpp | 2 +- example/10_conv2d_bwd_data/README.md | 38 +- .../conv2d_bwd_data_xdl.cpp | 4 +- example/11_conv2d_bwd_wgt/README.md | 37 +- example/12_reduce/README.md | 41 +- example/12_reduce/reduce_blockwise.cpp | 16 +- example/13_pool2d_fwd/README.md | 39 +- example/13_pool2d_fwd/pool2d_fwd.cpp | 6 +- example/15_grouped_gemm/README.md | 37 +- .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 4 +- .../16_gemm_reduce/gemm_reduce_xdl_fp16.cpp | 2 +- example/17_convnd_bwd_data_xdl/README.md | 39 +- .../convnd_bwd_data_xdl.cpp | 4 +- .../batched_gemm_reduce_xdl_fp16.cpp | 2 +- include/ck/config.hpp | 165 +++--- include/ck/tensor/static_tensor.hpp | 8 +- .../gpu/block/blockwise_gemm_dlops_v2r2.hpp | 4 +- .../gpu/block/blockwise_gemm_dlops_v2r3.hpp | 4 +- 
.../gpu/block/blockwise_gemm_dlops_v3.hpp | 2 +- .../gpu/block/blockwise_gemm_xdlops.hpp | 6 +- .../blockwise_tensor_slice_transfer_v4r1.hpp | 2 +- .../blockwise_tensor_slice_transfer_v5r1.hpp | 2 +- .../blockwise_tensor_slice_transfer_v6r1.hpp | 2 +- .../blockwise_tensor_slice_transfer_v6r2.hpp | 2 +- .../blockwise_tensor_slice_transfer_v6r3.hpp | 2 +- ...nvolution_backward_data_specialization.hpp | 2 +- .../convolution_forward_specialization.hpp | 12 +- ...evice_batched_gemm_reduce_xdl_cshuffle.hpp | 50 +- .../gpu/device/device_batched_gemm_xdl.hpp | 2 +- ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 4 +- ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 8 +- ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 14 +- ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 14 +- ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 14 +- .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 12 +- ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 6 +- ..._convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp | 14 +- .../device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp | 20 +- .../device_gemm_reduce_xdl_cshuffle.hpp | 50 +- .../gpu/device/device_gemm_xdl.hpp | 10 +- .../gpu/device/device_gemm_xdl_c_shuffle.hpp | 2 +- .../device_gemm_xdl_c_shuffle_bias_2d.hpp | 2 +- ...ice_gemm_xdl_c_shuffle_bias_activation.hpp | 2 +- ...gemm_xdl_c_shuffle_bias_activation_add.hpp | 2 +- .../gpu/device/device_gemm_xdl_cshuffle.hpp | 40 +- .../gpu/device/device_gemm_xdl_splitk.hpp | 12 +- .../device_gemm_xdl_splitk_c_shuffle.hpp | 12 +- .../gpu/device/device_grouped_gemm_xdl.hpp | 10 +- .../gpu/device/device_pool2d_fwd.hpp | 4 +- .../device/device_pool2d_fwd_nhwc_nhwc.hpp | 4 +- .../gpu/device/gemm_specialization.hpp | 2 +- .../gpu/device/reduction_operator_mapping.hpp | 32 +- .../grid/gridwise_2d_reduction_blockwise.hpp | 79 ++- ...ise_2d_reduction_multiblock_atomic_add.hpp | 15 +- ...2d_reduction_multiblock_partial_reduce.hpp | 49 +- .../grid/gridwise_2d_reduction_threadwise.hpp | 37 +- .../grid/gridwise_contraction_dlops_v1r2.hpp 
| 22 +- .../gpu/grid/gridwise_gemm_dlops_v1r2.hpp | 22 +- .../gpu/grid/gridwise_gemm_dlops_v1r3.hpp | 22 +- .../gpu/grid/gridwise_gemm_dlops_v2.hpp | 16 +- .../gpu/grid/gridwise_gemm_dlops_v3.hpp | 98 ++-- .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp | 32 +- .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 20 +- .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp | 16 +- .../gpu/grid/gridwise_gemm_xdlops_v2r4.hpp | 16 +- .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp | 20 +- .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp | 20 +- .../gpu/grid/gridwise_gemm_xdlops_v3r2.hpp | 22 +- .../gpu/grid/gridwise_gemm_xdlops_v3r3.hpp | 24 +- .../gpu/grid/gridwise_set_buffer_value.hpp | 6 +- .../threadwise_tensor_slice_transfer.hpp | 14 +- .../threadwise_tensor_slice_transfer_v1r4.hpp | 523 ------------------ .../threadwise_tensor_slice_transfer_v1r5.hpp | 453 --------------- .../threadwise_tensor_slice_transfer_v3r1.hpp | 16 +- .../threadwise_tensor_slice_transfer_v3r3.hpp | 14 +- .../threadwise_tensor_slice_transfer_v5r1.hpp | 12 +- .../threadwise_tensor_slice_transfer_v6r1.hpp | 2 +- .../threadwise_tensor_slice_transfer_v6r2.hpp | 2 +- .../threadwise_tensor_slice_transfer_v6r3.hpp | 2 +- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 4 +- include/ck/utility/amd_address_space.hpp | 8 +- include/ck/utility/amd_buffer_addressing.hpp | 8 +- include/ck/utility/common_header.hpp | 21 +- include/ck/utility/data_type_enum.hpp | 2 +- include/ck/utility/data_type_enum_helper.hpp | 22 +- include/ck/utility/dynamic_buffer.hpp | 312 +++++------ .../ck/utility/{utility.hpp => get_id.hpp} | 6 +- include/ck/utility/multi_index.hpp | 2 +- include/ck/utility/reduction_enums.hpp | 8 +- include/ck/utility/static_buffer.hpp | 16 +- include/ck/utility/synchronization.hpp | 2 +- .../ck/library/host_tensor/conv_common.hpp | 8 +- .../ck/library/host_tensor/device_tensor.hpp | 1 - .../library/host_tensor/host_reduce_util.hpp | 86 +-- .../ck/library/host_tensor/host_reduction.hpp | 2 +- 
...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 2 +- ...plicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp | 2 +- ...icit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp | 2 +- ..._gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp | 2 +- ...mm_v4r4r2_xdlops_atomic_nchw_kcyx_nkhw.hpp | 2 +- ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 2 +- ...mm_v4r4r4_xdlops_atomic_nhwc_kyxc_nhwk.hpp | 2 +- ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp | 2 +- ...mm_v4r4r5_xdlops_atomic_nhwc_kyxc_nhwk.hpp | 2 +- ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 2 +- ...licit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp | 2 +- ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 2 +- ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp | 2 +- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 2 +- ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 2 +- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 2 +- .../device_gemm_xdlops_km_kn_mn.hpp | 2 +- .../device_gemm_xdlops_km_kn_nm.hpp | 2 +- .../device_gemm_xdlops_km_nk_mn.hpp | 2 +- .../device_gemm_xdlops_km_nk_nm.hpp | 2 +- .../device_gemm_xdlops_mk_kn_mn.hpp | 2 +- .../device_gemm_xdlops_mk_kn_nm.hpp | 2 +- .../device_gemm_xdlops_mk_nk_mn.hpp | 2 +- .../device_gemm_xdlops_mk_nk_nm.hpp | 2 +- .../driver_contraction_dlops_v1r2.hpp | 2 +- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 4 +- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 4 +- ...emm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp | 4 +- .../driver_gemm_dlops_v1r2.hpp | 2 +- .../driver_gemm_dlops_v1r3.hpp | 2 +- .../driver_gemm_xdlops_v2r3.hpp | 2 +- .../driver_gemm_xdlops_v2r4.hpp | 2 +- .../device_reduce_instance_blockwise.hpp | 52 +- ..._reduce_instance_blockwise_second_call.hpp | 52 +- ..._reduce_instance_multiblock_atomic_add.hpp | 58 +- ...uce_instance_multiblock_partial_reduce.hpp | 52 +- .../device_reduce_instance_threadwise.hpp | 52 +- .../conv_add_fwd_driver_offline_nchwc.cpp | 6 +- .../conv_fwd_driver_offline_nchwc.cpp | 8 +- .../conv_maxpool_fwd_driver_offline_nchwc.cpp | 24 +- 
...6_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 2 +- ...6_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 2 +- ...nv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp | 6 +- ...onv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp | 6 +- ...onv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp | 6 +- ...nv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp | 6 +- ..._data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 4 +- ...d_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 4 +- ...d_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 4 +- ..._data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 4 +- ..._c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +- ...d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 6 +- ...2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 6 +- ...2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 6 +- ...d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 6 +- ..._bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 10 +- ...s_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 8 +- ...atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp | 4 +- ...wd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 6 +- ...fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 6 +- ...fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 6 +- ...wd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 6 +- ...bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp | 6 +- ..._bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp | 4 +- ..._bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp | 4 +- ...bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp | 4 +- ..._data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 6 +- ...d_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 4 +- ...d_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 4 +- ..._data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 4 +- ...ta_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 6 +- ...ata_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 4 +- ...ata_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 4 +- ...ta_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 4 +- ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- 
...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 4 +- ...gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_km_kn_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_km_nk_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_mk_kn_mn_instance.cpp | 2 +- ...l_splitk_f32_f32_f32_mk_nk_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 2 +- ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 2 +- ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 4 +- profiler/README.md | 48 ++ .../include/profile_convnd_bwd_data_impl.hpp | 2 +- profiler/include/profile_reduce_impl.hpp | 40 +- profiler/src/README.md | 81 --- profiler/src/profile_batched_gemm_reduce.cpp | 23 +- profiler/src/profile_convnd_bwd_data.cpp | 16 +- profiler/src/profile_gemm_reduce.cpp | 23 +- profiler/src/profile_grouped_gemm.cpp | 8 +- profiler/src/profile_reduce.cpp | 78 +-- profiler/src/profiler.cpp | 40 +- script/cmake-rocm.sh | 4 +- test/include/conv_test_util.hpp | 2 +- .../magic_number_division.cpp | 2 +- test/reduce/reduce_no_index.cpp | 10 +- test/reduce/reduce_with_index.cpp | 10 +- 227 files changed, 1398 insertions(+), 2944 deletions(-) delete mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp delete 
mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp rename include/ck/utility/{utility.hpp => get_id.hpp} (88%) create mode 100644 profiler/README.md delete mode 100644 profiler/src/README.md diff --git a/Jenkinsfile b/Jenkinsfile index 1aaaf932c1..76fb68b881 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -182,7 +182,7 @@ pipeline { { agent { label rocmnode("nogpu")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') @@ -192,7 +192,7 @@ pipeline { { agent { label rocmnode("nogpu")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ } steps{ // until we stabilize debug build due to compiler crashes @@ -228,7 +228,7 @@ pipeline { { agent{ label rocmnode("gfx908")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release') diff --git a/README.md b/README.md index 8b13789179..4011d34415 100644 --- a/README.md +++ b/README.md @@ -1 +1,45 @@ +## Docker script +```bash +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +rocm/tensorflow:rocm4.3.1-tf2.6-dev \ +/bin/bash +``` +## Build +```bash +mkdir build && cd build +``` + +```bash +# Need to specify target ID, example below is gfx908 and gfx90a +cmake \ +-D BUILD_DEV=OFF \ +-D CMAKE_BUILD_TYPE=Release \ +-D 
CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +.. +``` + +### Build and Run Examples +```bash + make -j examples +``` +Instructions for running each individual examples are under ```example/``` + +## Tests +```bash + make -j tests + make test +``` + +## Build ckProfiler +```bash + make -j ckProfiler +``` +Instructions for running ckProfiler are under ```profiler/``` diff --git a/example/01_gemm/README.md b/example/01_gemm/README.md index d8c388117f..226783b03b 100644 --- a/example/01_gemm/README.md +++ b/example/01_gemm/README.md @@ -1,44 +1,11 @@ -# Instructions for ```gemm_xdl``` Example +# Instructions for ```example_gemm_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```gemm_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j gemm_xdl -``` - -## Run ```gemm_xdl``` +## Run ```example_gemm_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) -./example/gemm_xdl 0 1 5 +./bin/example_gemm_xdl 0 1 5 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 5be6deb850..8d6b6adaa8 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -40,7 +40,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // clang-format off using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle diff --git a/example/02_gemm_alpha_beta/README.md b/example/02_gemm_alpha_beta/README.md index a3dc4a75fc..ba2a3068f3 100644 --- a/example/02_gemm_alpha_beta/README.md +++ b/example/02_gemm_alpha_beta/README.md @@ -1,44 +1,11 @@ -# Instructions for ```gemm_xdl_alpha_beta``` Example +# Instructions for ```example_gemm_xdl_alpha_beta``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```gemm_xdl_alpha_beta``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j gemm_xdl_alpha_beta -``` - -## Run ```gemm_xdl_alpha_beta``` +## Run ```example_gemm_xdl_alpha_beta``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) -./example/gemm_xdl_alpha_beta 1 1 1 0.5 0.5 +./bin/example_gemm_xdl_alpha_beta 1 1 1 0.5 0.5 ``` Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16) ``` diff --git a/example/03_gemm_bias_relu/README.md b/example/03_gemm_bias_relu/README.md index 379f9a2e75..f8d9bd6152 100644 --- a/example/03_gemm_bias_relu/README.md +++ b/example/03_gemm_bias_relu/README.md @@ -1,45 +1,12 @@ -# Instructions for ```gemm_xdl_bias_relu_add``` Example +# Instructions for ```example_gemm_xdl_bias_relu_add``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```gemm_xdl_bias_relu_add``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j gemm_xdl_bias_relu_add -``` - -## Run ```gemm_xdl_bias_relu_add``` +## Run ```example_gemm_xdl_bias_relu_add``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC -./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096 +./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) diff --git a/example/04_gemm_bias_relu_add/README.md b/example/04_gemm_bias_relu_add/README.md index 379f9a2e75..f8d9bd6152 100644 --- a/example/04_gemm_bias_relu_add/README.md +++ b/example/04_gemm_bias_relu_add/README.md @@ -1,45 +1,12 @@ -# Instructions for ```gemm_xdl_bias_relu_add``` Example +# Instructions for ```example_gemm_xdl_bias_relu_add``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```gemm_xdl_bias_relu_add``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j gemm_xdl_bias_relu_add -``` - -## Run ```gemm_xdl_bias_relu_add``` +## Run ```example_gemm_xdl_bias_relu_add``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC -./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096 +./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) diff --git a/example/05_conv2d_fwd/README.md b/example/05_conv2d_fwd/README.md index 4114571afe..08a7f0d56c 100644 --- a/example/05_conv2d_fwd/README.md +++ b/example/05_conv2d_fwd/README.md @@ -1,45 +1,12 @@ -# Instructions for ```conv2d_fwd_xdl``` Example +# Instructions for ```example_conv2d_fwd_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```conv2d_fwd_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j conv2d_fwd_xdl -``` - -## Run ```conv2d_fwd_xdl``` +## Run ```example_conv2d_fwd_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx -./example/conv2d_fwd_xdl 0 1 5 +./bin/example_conv2d_fwd_xdl 0 1 5 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) diff --git a/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp b/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp index 4f255fda9d..c1f5c3b169 100644 --- a/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp +++ b/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp @@ -34,7 +34,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; using DeviceConvFwdInstance = ck::tensor_operation::device:: DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< diff --git a/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp b/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp index 8614f53472..ea5e7a1fd9 100644 --- a/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp +++ b/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp @@ -35,7 +35,7 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; using DeviceConvFwdInstance = ck::tensor_operation::device:: DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< diff --git a/example/06_conv2d_fwd_bias_relu/README.md 
b/example/06_conv2d_fwd_bias_relu/README.md index eed5605a9e..4c30563ef0 100644 --- a/example/06_conv2d_fwd_bias_relu/README.md +++ b/example/06_conv2d_fwd_bias_relu/README.md @@ -1,45 +1,12 @@ -# Instructions for ```conv_xdl_bias_relu_add``` Example +# Instructions for ```example_conv_xdl_bias_relu``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```conv_xdl_bias_relu_add``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j conv_xdl_bias_relu_add -``` - -## Run ```conv_xdl_bias_relu_add``` +## Run ```example_conv_xdl_bias_relu``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx -./example/conv_xdl_bias_relu_add 0 1 5 +./bin/example_conv_xdl_bias_relu 0 1 5 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) @@ -48,14 +15,8 @@ in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} bias_k: dim 1, lengths {256}, strides {1} -resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -arg.a_grid_desc_k0_m_k1_{216, 165888, 8} -arg.b_grid_desc_k0_n_k1_{216, 256, 8} -arg.c_grid_desc_m_n_{ 165888, 256} -arg.c0_grid_desc_m_n_{ 165888, 256} -arg.c1_grid_desc_m_n_{ 165888, 256} launch_and_time_kernel: grid_dim {1296, 1, 1}, 
block_dim {256, 1, 1} Warm up Start running 5 times... -Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s +Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s ``` diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index d251aa35e1..0b3e15a25e 100644 --- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -32,10 +32,10 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::AddRelu; -static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set; +static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; // clang-format off using DeviceConvFwdInstance = ck::tensor_operation::device:: diff --git a/example/07_conv2d_fwd_bias_relu_add/README.md b/example/07_conv2d_fwd_bias_relu_add/README.md index eed5605a9e..99afcae9c8 100644 --- a/example/07_conv2d_fwd_bias_relu_add/README.md +++ b/example/07_conv2d_fwd_bias_relu_add/README.md @@ -1,45 +1,13 @@ -# Instructions for ```conv_xdl_bias_relu_add``` Example +# Instructions for ```example_conv_xdl_bias_relu_add``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` -## Build ```conv_xdl_bias_relu_add``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D 
CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j conv_xdl_bias_relu_add -``` - -## Run ```conv_xdl_bias_relu_add``` +## Run ```example_conv_xdl_bias_relu_add``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx -./example/conv_xdl_bias_relu_add 0 1 5 +./bin/example_conv_xdl_bias_relu_add 0 1 5 ``` Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) @@ -49,13 +17,8 @@ wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} bias_k: dim 1, lengths {256}, strides {1} resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -arg.a_grid_desc_k0_m_k1_{216, 165888, 8} -arg.b_grid_desc_k0_n_k1_{216, 256, 8} -arg.c_grid_desc_m_n_{ 165888, 256} -arg.c0_grid_desc_m_n_{ 165888, 256} -arg.c1_grid_desc_m_n_{ 165888, 256} launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} Warm up Start running 5 times... 
-Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s +Perf: 1.44711 ms, 101.421 TFlops, 289.218 GB/s ``` diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index d6011b98a9..bcfde547b2 100644 --- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -33,7 +33,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; // clang-format off using DeviceConvFwdInstance = ck::tensor_operation::device:: diff --git a/example/08_conv3d_fwd/README.md b/example/08_conv3d_fwd/README.md index 06339b74e5..962c603871 100644 --- a/example/08_conv3d_fwd/README.md +++ b/example/08_conv3d_fwd/README.md @@ -1,57 +1,24 @@ -# Instructions for ```conv3d_fwd_xdl``` Example +# Instructions for ```example_conv3d_fwd_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```conv3d_fwd_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j conv3d_fwd_xdl -``` - -## Run ```conv3d_fwd_xdl``` +## Run ```example_conv3d_fwd_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, leftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx -./example/conv3d_fwd_xdl 0 1 5 +./bin/example_conv3d_fwd_xdl 0 1 5 ``` -Result (MI100 dynamic frequency) +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) ``` -in: dim 5, lengths {4, 71, 71, 71, 192}, strides {68718912, 967872, 13632, 192, 1} wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1} out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1} -a_grid_desc_b_k0_m_k1{1, 648, 186624, 8} -b_grid_desc_b_k0_n_k1{1, 648, 256, 8} +num_batches_of_GEMM = 1 +a_grid_desc_k0_m_k1{648, 186624, 8} +b_grid_desc_k0_n_k1{648, 256, 8} +c_grid_desc_m_n{ 186624, 256} launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1} Warm up Start running 5 times... 
-Perf: 4.49466 ms, 110.206 TFlops, 144.161 GB/s +Perf: 4.58795 ms, 107.965 TFlops, 141.23 GB/s ``` - diff --git a/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp b/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp index 89d2933619..5f89ee3c19 100644 --- a/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp +++ b/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp @@ -37,7 +37,7 @@ using WeiLayout = ck::tensor_layout::convolution::KZYXC; using OutLayout = ck::tensor_layout::convolution::NDHWK; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; using DeviceConv3dFwdInstance = ck::tensor_operation::device:: DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K< diff --git a/example/09_convnd_fwd/README.md b/example/09_convnd_fwd/README.md index d85a409165..9ab5fee549 100644 --- a/example/09_convnd_fwd/README.md +++ b/example/09_convnd_fwd/README.md @@ -1,39 +1,6 @@ -# Instructions for ```convnd_fwd_xdl``` Example +# Instructions for ```example_convnd_fwd_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```convnd_fwd_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j convnd_fwd_xdl -``` - -## Run ```convnd_fwd_xdl``` +## Run ```example_convnd_fwd_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) @@ -47,7 +14,7 @@ cmake \ # , (ie Dy, Dx for 2D) # , (ie LeftPy, LeftPx for 2D) # , (ie RightPy, RightPx for 2D) -./example/convnd_fwd_xdl 0 1 100 +./bin/example_convnd_fwd_xdl 0 1 100 ``` Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl.cpp index d26a52b2fd..3caaf6720c 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp @@ -26,7 +26,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; using DeviceConvFwdBasePtr = ck::tensor_operation::device::DeviceConvFwdPtr; diff --git a/example/10_conv2d_bwd_data/README.md b/example/10_conv2d_bwd_data/README.md index 547c544445..7503ff6d1e 100644 --- a/example/10_conv2d_bwd_data/README.md +++ b/example/10_conv2d_bwd_data/README.md @@ -1,45 +1,13 @@ -# Instructions for ```conv2d_bwd_data_xdl``` Example +# Instructions for ```example_conv2d_bwd_data_xdl``` Example -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` -## Build ```conv2d_bwd_data_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D 
CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j conv2d_bwd_data_xdl -``` - -## Run ```conv2d_bwd_data_xdl``` +## Run ```example_conv2d_bwd_data_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx -./bin/conv2d_bwd_data_xdl 0 1 5 +./bin/example_conv2d_bwd_data_xdl 0 1 5 ``` Result diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp index ee8eaf2209..8307157cec 100644 --- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp +++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp @@ -27,7 +27,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; using DeviceConvBwdDataInstance = ck::tensor_operation::device:: DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< @@ -38,7 +38,7 @@ using DeviceConvBwdDataInstance = ck::tensor_operation::device:: InElementOp, // InElementwiseOperation WeiElementOp, // WeiElementwiseOperation OutElementOp, // OutElementwiseOperation - ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t + ConvBwdDefault, // ConvolutionBackwardDataSpecialization 256, // BlockSize 128, // MPerBlock 128, // NPerBlock diff --git a/example/11_conv2d_bwd_wgt/README.md b/example/11_conv2d_bwd_wgt/README.md index 16e9bbc455..39ba140d45 100644 --- a/example/11_conv2d_bwd_wgt/README.md +++ b/example/11_conv2d_bwd_wgt/README.md @@ -1,39 +1,6 @@ -# Instructions for ```conv2d_wrw_xdl``` Example +# Instructions for 
```example_conv2d_wrw_xdl``` Example -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```conv2d_wrw_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j conv2d_wrw_xdl -``` - -## Run ```conv2d_wrw_xdl``` +## Run ```example_conv2d_wrw_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md index 20e1b5aa6a..6fd3b3dcf3 100644 --- a/example/12_reduce/README.md +++ b/example/12_reduce/README.md @@ -1,45 +1,12 @@ -# Instructions for ```reduce_blockwise``` Example +# Instructions for ```example_reduce_blockwise``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```reduce_blockwise``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j reduce_blockwise -``` - -## Run ```reduce_blockwise``` +## Run ```example_reduce_blockwise``` ```bash # -D : input 4-d tensor lengths # -v : verification (0=no, 1=yes) #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) #arg2: run kernel # of times (>1) -./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10 +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 ``` Result @@ -50,7 +17,7 @@ Start running 3 times... Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> error: 0 max_diff: 0, 529, 529 -root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10 +root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} Warm up Start running 10 times... diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index b97799203b..41962ac43d 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -32,10 +32,10 @@ using HostAccDataType = float; constexpr int Rank = 4; constexpr int NumReduceDim = 3; -constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2; -constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; -constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; -constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES; +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; +constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; +constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; +constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; using ReduceOperation = typename reduce_binary_operator::opType; using InElementwiseOperation = @@ -210,11 +210,11 @@ int main(int argc, char* argv[]) return (-1); constexpr bool op_support_indices = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES)); + (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); // if input is half type, no reason to use float for indiced reduction operation and must use // float for non-indiced reduction operation for accuracy @@ -230,7 +230,7 @@ int main(int argc, char* argv[]) // indices option can only be used when it is really needed constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3); diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md index 4b994e7382..d9c829fb98 100644 --- a/example/13_pool2d_fwd/README.md +++ b/example/13_pool2d_fwd/README.md @@ -1,45 +1,12 @@ -# Instructions for ```pool2d_fwd``` Example +# Instructions for ```example_pool2d_fwd``` Example -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```pool2d_fwd``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D 
CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j pool2d_fwd -``` - -## Run ```pool2d_fwd``` +## Run ```example_pool2d_fwd``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) #arg3: run kernel # of times (>1) #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx -./example/pool2d_fwd 1 1 10 +./bin/example_pool2d_fwd 1 1 10 ``` Result diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index 0b4aba3af1..6c16ed57d0 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -22,9 +22,9 @@ using InLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC; #if 1 -static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::MAX; +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; #else -static constexpr auto ReduceOpId = ck::ReduceTensorOp_t::AVG; +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; #endif static constexpr bool NeedIndices = false; @@ -47,7 +47,7 @@ using DevicePoolFwdInstance = template static void pool_host_verify(const Tensor& in, diff --git a/example/15_grouped_gemm/README.md b/example/15_grouped_gemm/README.md index b8245dc05a..c83b23e08c 100644 --- a/example/15_grouped_gemm/README.md +++ b/example/15_grouped_gemm/README.md @@ -1,39 +1,6 @@ -# Instructions for ```grouped_gemm_xdl``` Example +# Instructions for ```example_grouped_gemm_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```grouped_gemm_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is 
gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j example_grouped_gemm_xdl_fp16 -``` - -## Run ```grouped_gemm_xdl``` +## Run ```example_grouped_gemm_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 7c23a2f468..bfad477163 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -40,9 +40,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // static constexpr auto GemmMNPadding = -// ck::tensor_operation::device::GemmSpecialization_t::MNPadding; +// ck::tensor_operation::device::GemmSpecialization::MNPadding; // clang-format off using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp index 673dce82db..0346075c36 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp @@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum; using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum; static constexpr auto GemmSpecialization = - ck::tensor_operation::device::GemmSpecialization_t::Default; + ck::tensor_operation::device::GemmSpecialization::Default; // 
clang-format off using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle diff --git a/example/17_convnd_bwd_data_xdl/README.md b/example/17_convnd_bwd_data_xdl/README.md index ac625d1716..b5c8281ed8 100644 --- a/example/17_convnd_bwd_data_xdl/README.md +++ b/example/17_convnd_bwd_data_xdl/README.md @@ -1,46 +1,13 @@ -# Instructions for ```convnd_bwd_data_xdl``` Example +# Instructions for ```example_convnd_bwd_data_xdl``` -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```convnd_bwd_data_xdl``` -```bash -mkdir build && cd build -``` - -```bash -# Need to specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. 
-``` - -```bash - make -j convnd_bwd_data_xdl -``` - -## Run ```example_convnd_bwd_data_xdl``` +## Run ```example_example_convnd_bwd_data_xdl``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=integer value, 2=decimal value) #arg3: run kernel # of times (>1) #arg4: num_dim_spatial(1|2|3) #arg5 to ...: N, K, C, [Z,] [Y,] X, [Di,] [Hi,] Wi, S[z,] [Sy,] Sx, [Dz,] [Dy,] Dx, [LeftPz,] [LeftPy,] LeftPx, [RightPy,] [RightPy,] RightPx -./bin/convnd_bwd_data_xdl 0 1 5 +./bin/example_convnd_bwd_data_xdl 0 1 5 ``` Result diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp index 8db17f7398..60c66e621b 100644 --- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp +++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp @@ -29,7 +29,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; using DeviceConvBwdDataBasePtr = ck::tensor_operation::device::DeviceConvBwdDataPtr; @@ -44,7 +44,7 @@ using DeviceConvNDBwdDataInstance = ck::tensor_operation::device:: InElementOp, // InElementwiseOperation WeiElementOp, // WeiElementwiseOperation OutElementOp, // OutElementwiseOperation - ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t + ConvBwdDefault, // ConvolutionBackwardDataSpecialization NumDimSpatial, // NumDimSpatial 256, // BlockSize 128, // MPerBlock diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index 8e30ef0c79..3f6a8a11ae 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ 
b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum; using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum; static constexpr auto GemmSpecialization = - ck::tensor_operation::device::GemmSpecialization_t::Default; + ck::tensor_operation::device::GemmSpecialization::Default; // clang-format off using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle diff --git a/include/ck/config.hpp b/include/ck/config.hpp index 2390d5f26c..eedeb7e136 100644 --- a/include/ck/config.hpp +++ b/include/ck/config.hpp @@ -6,15 +6,9 @@ #include "hip/hip_fp16.h" #endif -// "Constant" address space for kernel parameter -#define CONSTANT __attribute__((address_space(4))) - -// GPU target -// should enable one and only one GPU target -#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) -#error Need to define (only) one GPU target -#endif +// constant address space for kernel parameter +// https://llvm.org/docs/AMDGPUUsage.html#address-spaces +#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) // launch bounds #define CK_USE_LAUNCH_BOUNDS 1 @@ -24,155 +18,134 @@ #define CK_MIN_BLOCK_PER_CU 2 #endif -// GPU-specific parameters -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) -// buffer resourse +// check GPU target +#ifdef __HIP_DEVICE_COMPILE__ +#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) || defined(__gfx1030__)) +#error Not supported target +#endif +#endif + +// buffer resourse, wave size +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_BUFFER_RESOURCE_3RD_DWORD -1 +#define 
CK_GPU_WAVE_SIZE -1 +#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 -// wave size #define CK_GPU_WAVE_SIZE 64 -#elif defined(CK_AMD_GPU_GFX1030) +#elif defined(__gfx1030__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 #define CK_GPU_WAVE_SIZE 32 #endif // FMA instruction -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) +#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing +#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code #define CK_USE_AMD_V_MAC_F32 -#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \ - defined(CK_AMD_GPU_GFX1030) +#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__gfx1030__) // for GPU code #define CK_USE_AMD_V_FMAC_F32 #define CK_USE_AMD_V_DOT2_F32_F16 #define CK_USE_AMD_V_DOT4_I32_I8 #endif -// multi index -#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 - -// AMD inline asm -#ifndef CK_USE_AMD_INLINE_ASM -#define CK_USE_AMD_INLINE_ASM 1 +// MFMA instruction +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_MFMA +#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code +#define CK_USE_AMD_MFMA #endif -// AMD inner product (DLOP) -#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM -#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 +#if defined(__gfx90a__) +#define CK_USE_AMD_MFMA_BF16_1K_OP #endif -// AMD buffer_load -#ifndef CK_USE_AMD_BUFFER_LOAD +// buffer load #define CK_USE_AMD_BUFFER_LOAD 1 -#endif -// AMD buffer_store -#ifndef CK_USE_AMD_BUFFER_STORE +// buffer store #define CK_USE_AMD_BUFFER_STORE 1 + +// buffer atomic add: integer +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 + +// buffer atomic add: floating point +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 +#elif defined(__gfx908__) || 
defined(__gfx90a__) // for GPU code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 +#else // for GPU code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0 #endif -// AMD buffer_atomic_add -#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD -#define CK_USE_AMD_BUFFER_ATOMIC_ADD 1 -#endif +// inline asm +#define CK_USE_AMD_INLINE_ASM 1 -// AMD XDLOPS -#ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 0 -#endif +// inner product (DLOP) +#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0) -#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM -#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 -#endif +#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 -// experimental implementation for buffer load/store/atomic -#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 -#endif - -// experimental implementation for in-regsiter sub-dword transpose -#ifndef CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE -#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1 -#endif +// experimental feature: multi index implemented as array +#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 +// experimental feature: static tensor descriptor #define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 -// merge transformation use magic number division -#ifndef CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION +// experimental feature: buffer load/store/atomic-add OOB trick +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 +#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 +#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 + +// experimental 
feature: in-register sub-dword transpose
CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 -#endif -// workaround for register spill due to compiler issue, when casting type between fp32 and fp16 -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE 1 -#endif - -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1 -#endif - -// workaround for verifaction failure, due to compiler regression, for conv bwd-data fp16 using some +// workaround: verifaction failure, due to compiler regression, for conv bwd-data fp16 using some // tuning parameter -#ifndef CK_WORKAROUND_SWDEV_325164 #define CK_WORKAROUND_SWDEV_325164 1 -#endif // workaround for verification failure ConvNd forward // https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/135 -#ifndef CK_WORKAROUND_GITHUB_135 #define CK_WORKAROUND_GITHUB_135 1 -#endif namespace ck { -enum struct InMemoryDataOperationEnum_t +enum struct InMemoryDataOperationEnum { Set, AtomicAdd, Add }; -enum struct ActivTypeEnum_t +// TODO: no longer needed, remove this +enum struct ActivTypeEnum { None, LeakyRelu, diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp index b1a816167a..2ca920df9d 100644 --- a/include/ck/tensor/static_tensor.hpp +++ b/include/ck/tensor/static_tensor.hpp @@ -4,7 +4,7 @@ namespace ck { // StaticTensor for Scalar -template ::type = false> @@ -255,7 +255,7 @@ __host__ __device__ constexpr auto make_static_tensor(TensorDesc) } template < - AddressSpaceEnum_t AddressSpace, + AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename X, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp index 35ff66a2b0..2a8a4bc8b8 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp +++ 
b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp @@ -207,9 +207,9 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0, "wrong"); - auto a_thread_buf = make_static_buffer( + auto a_thread_buf = make_static_buffer( a_k_m0_m1_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( + auto b_thread_buf = make_static_buffer( b_k_n0_n1_thread_desc_.GetElementSpaceSize()); constexpr auto threadwise_gemm = diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp index 26ca0bf111..0a7b8486f4 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp @@ -220,9 +220,9 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0, "wrong"); - auto a_thread_buf = make_static_buffer( + auto a_thread_buf = make_static_buffer( a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( + auto b_thread_buf = make_static_buffer( b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize()); constexpr auto threadwise_contraction = diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp index 3df0497f61..78cfc1e0fb 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp @@ -119,7 +119,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{}; // thread A buffer for GEMM - StaticBuffer + StaticBuffer a_thread_buf; constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3( + auto a_thread_buf = make_static_buffer( a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = 
make_static_buffer( + auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); static_for<0, MRepeat, 1>{}([&](auto m0) { diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp index aa37fc32f1..5aa6600848 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp @@ -16,7 +16,7 @@ namespace ck { template {}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding || - GemmSpecialization == GemmSpecialization_t::MKPadding) + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) { // pad M, but not N return transform_tensor_descriptor( @@ -397,8 +397,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding || - GemmSpecialization == GemmSpecialization_t::NKPadding) + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) { // pad N, but not M return transform_tensor_descriptor( @@ -422,10 +422,10 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce"; if constexpr(ConvBackwardDataSpecialization == - ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0){ + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0){ str<< " Filter1x1Stride1Pad0"; } diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp index 4612e92de9..b13466274f 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ 
b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -44,7 +44,7 @@ template {}, Sequence<1>{})); } else if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Pad0) + ConvolutionForwardSpecialization::Filter1x1Pad0) { const auto in_n_wi_c_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); @@ -262,7 +262,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K const index_t ConvStrideW = conv_filter_strides[1]; if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0) + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) { const auto in_gemmmraw_gemmk_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k)); @@ -276,7 +276,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Pad0) + ConvolutionForwardSpecialization::Filter1x1Pad0) { const auto in_n_hi_wi_c_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); @@ -395,7 +395,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K const index_t ConvStrideW = conv_filter_strides[2]; if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0) + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) { const auto in_gemmmraw_gemmk_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k)); @@ -409,7 +409,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Pad0) + ConvolutionForwardSpecialization::Filter1x1Pad0) { const auto in_n_di_hi_wi_c_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); 
@@ -613,7 +613,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K ABDataType, // TODO: distinguish A/B datatype AccDataType, CDataType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, @@ -878,7 +878,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K } if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0) + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) { // check if it's 1x1, stride=1 conv for(ck::index_t i = 0; i < NumDimSpatial; ++i) @@ -891,7 +891,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K } } else if constexpr(ConvForwardSpecialization == - ConvolutionForwardSpecialization_t::Filter1x1Pad0) + ConvolutionForwardSpecialization::Filter1x1Pad0) { // check if it's 1x1 conv for(ck::index_t i = 0; i < NumDimSpatial; ++i) diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp index 7b31bf457d..8c02ddd3fd 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp @@ -29,7 +29,7 @@ template {}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding || - GemmSpecialization == GemmSpecialization_t::MKPadding) + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) { // pad M, but not N return transform_tensor_descriptor( @@ -321,8 +321,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding || - GemmSpecialization == GemmSpecialization_t::NKPadding) + else if 
constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) { // pad N, but not M return transform_tensor_descriptor( @@ -346,10 +346,10 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding || - GemmSpecialization == GemmSpecialization_t::MKPadding) + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) { // pad M, but not N return transform_tensor_descriptor( @@ -310,8 +310,8 @@ struct DeviceGemm_Xdl_CShuffle make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); } - else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding || - GemmSpecialization == GemmSpecialization_t::NKPadding) + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) { // pad N, but not M return transform_tensor_descriptor( @@ -340,7 +340,7 @@ struct DeviceGemm_Xdl_CShuffle AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp index f943111dc2..db6c884739 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp @@ -31,7 +31,7 @@ template {}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; return transform_tensor_descriptor( @@ -136,7 +136,7 @@ struct DeviceGemmXdlSplitK 
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; return transform_tensor_descriptor( @@ -170,7 +170,7 @@ struct DeviceGemmXdlSplitK } }(); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; @@ -209,7 +209,7 @@ struct DeviceGemmXdlSplitK ADataType, // TODO: distinguish A/B datatype AccDataType, CDataType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, @@ -250,7 +250,7 @@ struct DeviceGemmXdlSplitK ADataType, // TODO: distinguish A/B datatype AccDataType, CDataType, - InMemoryDataOperationEnum_t::AtomicAdd, + InMemoryDataOperationEnum::AtomicAdd, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp index f720960680..9de5361ab6 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp @@ -31,7 +31,7 @@ template {}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; return transform_tensor_descriptor( @@ -138,7 +138,7 @@ struct DeviceGemmXdlSplitKCShuffle make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if 
constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; return transform_tensor_descriptor( @@ -172,7 +172,7 @@ struct DeviceGemmXdlSplitKCShuffle } }(); - if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding) + if constexpr(GemmSpec == GemmSpecialization::MNPadding) { const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; @@ -211,7 +211,7 @@ struct DeviceGemmXdlSplitKCShuffle ADataType, // TODO: distinguish A/B datatype AccDataType, CDataType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, @@ -253,7 +253,7 @@ struct DeviceGemmXdlSplitKCShuffle ADataType, // TODO: distinguish A/B datatype AccDataType, CDataType, - InMemoryDataOperationEnum_t::AtomicAdd, + InMemoryDataOperationEnum::AtomicAdd, AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N, diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp index 0c74f569c0..bebe2fd61e 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp @@ -27,7 +27,7 @@ template +template struct DevicePool2dFwd : public BaseOperator { virtual std::unique_ptr @@ -29,7 +29,7 @@ struct DevicePool2dFwd : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template +template using DevicePool2dFwdPtr = std::unique_ptr>; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index 84593cdb5e..651d31ae2f 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -16,7 +16,7 @@ namespace device { 
template +template struct reduce_binary_operator; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Add; using dataType = T; @@ -50,7 +50,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Mul; using dataType = T; @@ -59,7 +59,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Min; using dataType = T; @@ -68,7 +68,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Max; using dataType = T; @@ -77,7 +77,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::AMax; using dataType = T; @@ -86,7 +86,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Add; using dataType = T; @@ -95,7 +95,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Add; using dataType = T; @@ -104,7 +104,7 @@ struct reduce_binary_operator }; template -struct reduce_binary_operator +struct reduce_binary_operator { using opType = reduce::Add; using dataType = T; @@ -115,7 +115,7 @@ struct reduce_binary_operator // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary // functor classes. 
// The two unary functors are called before and afer the Reduction is executed respectively -template +template struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; @@ -123,42 +123,42 @@ struct reduce_unary_operator }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; }; template -struct reduce_unary_operator +struct reduce_unary_operator { using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic; using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp index 14fe0818a5..a81739fdeb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp @@ -227,21 
+227,18 @@ struct GridwiseReduction_mk_to_m_blockwise const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( + const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( + auto out_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_buf; - StaticBuffer accu_value_buf; + StaticBuffer accu_value_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); @@ -336,7 +333,7 @@ struct GridwiseReduction_mk_to_m_blockwise { if(!float_equal_zero{}(beta)) { - StaticBuffer + StaticBuffer priorDstValueBuf; auto threadwise_dst_load = @@ -376,7 +373,7 @@ struct GridwiseReduction_mk_to_m_blockwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( out_grid_desc_m, @@ -422,30 +419,26 @@ struct GridwiseReduction_mk_to_m_blockwise const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( + const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_val_buf = make_dynamic_buffer( + auto out_global_val_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( + auto out_global_idx_buf = make_dynamic_buffer( p_indices_global, out_grid_desc_m.GetElementSpaceSize()); auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + 
make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_val_buf; - StaticBuffer + StaticBuffer in_thread_idx_buf; - StaticBuffer accu_value_buf; - StaticBuffer - accu_index_buf; + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); @@ -561,7 +554,7 @@ struct GridwiseReduction_mk_to_m_blockwise { if(!float_equal_zero{}(beta)) { - StaticBuffer + StaticBuffer priorDstValueBuf; auto threadwise_dst_load = @@ -601,7 +594,7 @@ struct GridwiseReduction_mk_to_m_blockwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, false>( out_grid_desc_m, @@ -619,7 +612,7 @@ struct GridwiseReduction_mk_to_m_blockwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, false>( out_grid_desc_m, @@ -678,36 +671,32 @@ struct GridwiseReduction_mk_to_m_blockwise const auto zeroVal = ReduceOperation::GetReductionZeroVal(); const auto src_global_val_buf = - make_dynamic_buffer(p_ws_values_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( + make_dynamic_buffer(p_ws_values_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + const auto src_global_idx_buf = make_dynamic_buffer( p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize()); - auto out_global_val_buf = make_dynamic_buffer( + auto out_global_val_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( + auto out_global_idx_buf = make_dynamic_buffer( p_indices_global, out_grid_desc_m.GetElementSpaceSize()); auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, 
BlockSize); + make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_val_buf; - StaticBuffer in_thread_idx_buf; - StaticBuffer accu_value_buf; - StaticBuffer - accu_index_buf; + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); @@ -835,7 +824,7 @@ struct GridwiseReduction_mk_to_m_blockwise { if(!float_equal_zero{}(beta)) { - StaticBuffer + StaticBuffer priorDstValueBuf; auto threadwise_dst_load = @@ -875,7 +864,7 @@ struct GridwiseReduction_mk_to_m_blockwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( out_grid_desc_m, @@ -893,7 +882,7 @@ struct GridwiseReduction_mk_to_m_blockwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( out_grid_desc_m, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp index 6a46135a33..2d54e84954 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp @@ -140,21 +140,18 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add // LDS __shared__ AccDataType p_block_reduce_buffer[BlockSize]; - const auto in_global_buf = make_dynamic_buffer( + const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_buf = make_dynamic_buffer( + auto out_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_buf; - StaticBuffer accu_value_buf; + 
StaticBuffer accu_value_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); @@ -259,7 +256,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::AtomicAdd, + InMemoryDataOperationEnum::AtomicAdd, 1, true>( out_grid_desc_m, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp index 0c76794754..bab95cf4d0 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp @@ -163,22 +163,19 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce __shared__ AccDataType p_block_reduce_buffer[BlockSize]; const auto in_global_buf = - make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_buf = make_dynamic_buffer( + make_dynamic_buffer(p_src_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto workspace_global_buf = make_dynamic_buffer( p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); auto block_reduce_buf = - make_dynamic_buffer(p_block_reduce_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_buf; - StaticBuffer accu_value_buf; + StaticBuffer accu_value_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); @@ -272,7 +269,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce Sequence<0, 1>, 1, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( workspace_desc_m_k, @@ -322,33 +319,29 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce __shared__ index_t p_block_reduce_idx_buffer[BlockSize]; const auto in_global_buf = - 
make_dynamic_buffer(p_src_global, - in_grid_desc_m_k.GetElementSpaceSize(), - type_convert(zeroVal)); - auto workspace_global_val_buf = make_dynamic_buffer( + make_dynamic_buffer(p_src_global, + in_grid_desc_m_k.GetElementSpaceSize(), + type_convert(zeroVal)); + auto workspace_global_val_buf = make_dynamic_buffer( p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize()); - auto workspace_global_idx_buf = make_dynamic_buffer( + auto workspace_global_idx_buf = make_dynamic_buffer( p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize()); auto block_reduce_val_buf = - make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_val_buffer, BlockSize); auto block_reduce_idx_buf = - make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); + make_dynamic_buffer(p_block_reduce_idx_buffer, BlockSize); - StaticBuffer + StaticBuffer in_thread_val_buf; - StaticBuffer in_thread_idx_buf; - StaticBuffer accu_value_buf; - StaticBuffer - accu_index_buf; + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; const index_t thread_local_id = get_thread_local_1d_id(); const index_t block_global_id = get_block_1d_id(); @@ -461,7 +454,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce Sequence<0, 1>, 1, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( workspace_desc_m_k, @@ -480,7 +473,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce Sequence<0, 1>, 1, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( workspace_desc_m_k, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index 86caea2a92..8a4985595b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -132,18 +132,15 @@ struct GridwiseReduction_mk_to_m_threadwise 
const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( + const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( + auto dst_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - StaticBuffer + StaticBuffer in_thread_buf; - StaticBuffer accu_value_buf; + StaticBuffer accu_value_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; }); @@ -223,7 +220,7 @@ struct GridwiseReduction_mk_to_m_threadwise true>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); - StaticBuffer + StaticBuffer priorDstValue_buf; threadwise_dst_load.Run(out_grid_desc_m, @@ -248,7 +245,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, false>( out_grid_desc_m, @@ -277,22 +274,18 @@ struct GridwiseReduction_mk_to_m_threadwise const auto zeroVal = ReduceOperation::GetReductionZeroVal(); - const auto in_global_buf = make_dynamic_buffer( + const auto in_global_buf = make_dynamic_buffer( p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert(zeroVal)); - auto out_global_val_buf = make_dynamic_buffer( + auto out_global_val_buf = make_dynamic_buffer( p_out_global, out_grid_desc_m.GetElementSpaceSize()); - auto out_global_idx_buf = make_dynamic_buffer( + auto out_global_idx_buf = make_dynamic_buffer( p_indices_global, out_grid_desc_m.GetElementSpaceSize()); - StaticBuffer + StaticBuffer in_thread_buf; - StaticBuffer accu_value_buf; - StaticBuffer - accu_index_buf; + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; @@ -382,7 +375,7 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * 
MThreadSliceSize)); - StaticBuffer + StaticBuffer priorDstValue_buf; threadwise_dst_load.Run(out_grid_desc_m, @@ -407,7 +400,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, false>( out_grid_desc_m, @@ -424,7 +417,7 @@ struct GridwiseReduction_mk_to_m_threadwise Sequence<0>, 0, OutDstVectorSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, false>( out_grid_desc_m, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp index 50e8f52c59..a9b6d8dfa0 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp @@ -55,7 +55,7 @@ template , integral_constant) { - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize()); const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0); @@ -383,7 +383,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN // A matrix blockwise copy auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< BlockSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, Sequence, ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, @@ -407,7 +407,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN // B matrix blockwise copy auto 
b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< BlockSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, Sequence, BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, @@ -467,7 +467,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; // register allocation for output - auto c_thread_buf = make_static_buffer( + auto c_thread_buf = make_static_buffer( c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize()); ThreadwiseTensorSliceSet_v1( + auto a_block_even_buf = make_dynamic_buffer( p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); - auto b_block_even_buf = make_dynamic_buffer( + auto b_block_even_buf = make_dynamic_buffer( p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); - auto a_block_odd_buf = make_dynamic_buffer( + auto a_block_odd_buf = make_dynamic_buffer( p_a_block_double + a_block_aligned_space_size, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); - auto b_block_odd_buf = make_dynamic_buffer( + auto b_block_odd_buf = make_dynamic_buffer( p_b_block_double + b_block_aligned_space_size, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp index d758309c24..a7ff81e209 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp @@ -55,7 +55,7 @@ template , integral_constant) { - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_grid, 
b_k_n0_n1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); const auto K = a_k_m0_m1_grid_desc.GetLength(I0); @@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2 // A matrix blockwise copy auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4, ABlockTransferThreadSliceLengths_K_M0_M1, ABlockTransferThreadClusterLengths_K_M0_M1, @@ -341,7 +341,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2 // B matrix blockwise copy auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4, BBlockTransferThreadSliceLengths_K_N0_N1, BBlockTransferThreadClusterLengths_K_N0_N1, @@ -403,7 +403,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2 FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; // register allocation for output - auto c_thread_buf = make_static_buffer( + auto c_thread_buf = make_static_buffer( c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); ThreadwiseTensorSliceSet_v1( + auto a_block_even_buf = make_dynamic_buffer( p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize()); - auto b_block_even_buf = make_dynamic_buffer( + auto b_block_even_buf = make_dynamic_buffer( p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize()); - auto a_block_odd_buf = make_dynamic_buffer( + auto a_block_odd_buf = make_dynamic_buffer( p_a_block_double + a_block_aligned_space_size, a_k_m0_m1_block_desc.GetElementSpaceSize()); - auto b_block_odd_buf = make_dynamic_buffer( + auto b_block_odd_buf = make_dynamic_buffer( p_b_block_double + b_block_aligned_space_size, b_k_n0_n1_block_desc.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp index 4a7db509ed..1a66c8ff3f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp +++ 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp @@ -55,7 +55,7 @@ template , integral_constant) { - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_grid, a_k0_m0_m1_k1_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_grid, b_k0_n0_n1_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); // divide block work by [M, N] @@ -325,7 +325,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3 // A matrix blockwise copy auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< BlockSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, Sequence, ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1, @@ -349,7 +349,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3 // B matrix blockwise copy auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< BlockSize, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, Sequence, BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1, @@ -409,7 +409,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3 FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; // register allocation for output - auto c_thread_buf = make_static_buffer( + auto c_thread_buf = make_static_buffer( c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); ThreadwiseTensorSliceSet_v1( + auto a_block_even_buf = make_dynamic_buffer( p_a_block_double, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); - auto b_block_even_buf = make_dynamic_buffer( + auto b_block_even_buf = make_dynamic_buffer( p_b_block_double, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); - auto a_block_odd_buf = make_dynamic_buffer( + auto a_block_odd_buf = make_dynamic_buffer( p_a_block_double + 
a_block_aligned_space_size, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); - auto b_block_odd_buf = make_dynamic_buffer( + auto b_block_odd_buf = make_dynamic_buffer( p_b_block_double + b_block_aligned_space_size, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp index 84ee6f40ec..607a05d156 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp @@ -15,7 +15,7 @@ template {}; constexpr auto I3 = Number<3>{}; - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_global, a_e_k_global_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( + auto c_global_buf = make_dynamic_buffer( p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); constexpr auto E = EPerBlock * 3 * 3; @@ -181,7 +181,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 // A matrix blockwise copy auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4, ABlockTransferThreadSliceLengths_E_K, ABlockTransferThreadClusterLengths_E_K, @@ -221,11 +221,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 b_e_n_ho_wo_global_desc, make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global)); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( p_shared_block, a_e_k_desc.GetElementSpaceSize()); // register allocation for output - StaticBuffer @@ -250,7 +250,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 BGlobalMoveSliceWindowStepHacks{}; // double regsiter buffer for b - StaticBuffer diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp index 
0b62fcd554..a36b5e53ce 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp @@ -20,7 +20,7 @@ template + ActivTypeEnum ActivType> __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) @@ -50,7 +50,7 @@ __global__ void c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant{}, - integral_constant{}); + integral_constant{}); } template + ActivTypeEnum ActivType> __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) @@ -94,7 +94,7 @@ __global__ void d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant{}, - integral_constant{}); + integral_constant{}); } template + ActivTypeEnum ActivType> __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) @@ -140,14 +140,14 @@ __global__ void d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant{}, - integral_constant{}); + integral_constant{}); } template {})); - StaticBuffer @@ -602,10 +602,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3 }); } - template + template __device__ static void Activation(CThreadBuff& c_thread_buf, const CThreadDesc_K1_N_H2_W2&, - integral_constant) + integral_constant) { constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; @@ -737,7 +737,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 I1, Number{})); - StaticBuffer @@ -783,7 +783,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, make_multi_index(k_block_work_id, @@ -843,7 +843,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 I1, Number{})); - StaticBuffer @@ -874,7 
+874,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, - InMemoryDataOperationEnum_t::Add, + InMemoryDataOperationEnum::Add, 1, true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, make_multi_index(k_block_work_id, @@ -964,7 +964,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 // A matrix blockwise copy auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4, ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, @@ -1023,11 +1023,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 0, 0)); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize()); //// register allocation for output - // StaticBuffer @@ -1050,7 +1050,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{}; // double regsiter buffer for b - StaticBuffer @@ -1294,21 +1294,21 @@ struct GridwiseGemmDlops_km_kn_mn_v3 const auto bias_k0_k1_grid_desc = MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( + auto c_global_buf = make_dynamic_buffer( p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); - auto d_global_buf = make_dynamic_buffer( + auto d_global_buf = make_dynamic_buffer( p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); - auto bias_global_buf = make_dynamic_buffer( + auto bias_global_buf = make_dynamic_buffer( p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); constexpr auto 
c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); // register allocation for output - StaticBuffer @@ -1344,7 +1344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CBlockIdToBlockClusterAdaptor_K_N_H_W, bool HasMainE0BlockLoop, - ActivTypeEnum_t ActivType> + ActivTypeEnum ActivType> __device__ static void ConvBiasActiv( const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_b_global, @@ -1356,26 +1356,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3 const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant, - integral_constant) + integral_constant) { - static constexpr auto activ_type = integral_constant{}; + static constexpr auto activ_type = integral_constant{}; const auto bias_k0_k1_grid_desc = MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( + auto c_global_buf = make_dynamic_buffer( p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); - auto bias_global_buf = make_dynamic_buffer( + auto bias_global_buf = make_dynamic_buffer( p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); // register allocation for output - StaticBuffer @@ -1423,7 +1423,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename CBlockIdToBlockClusterAdaptor_K_N_H_W, bool HasMainE0BlockLoop, - ActivTypeEnum_t ActivType> + ActivTypeEnum ActivType> __device__ 
static void ConvBiasActivMaxpool( const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_b_global, @@ -1437,28 +1437,28 @@ struct GridwiseGemmDlops_km_kn_mn_v3 const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant, - integral_constant) + integral_constant) { - static constexpr auto activ_type = integral_constant{}; + static constexpr auto activ_type = integral_constant{}; const auto bias_k0_k1_grid_desc = MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( + auto c_global_buf = make_dynamic_buffer( p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); - auto d_global_buf = make_dynamic_buffer( + auto d_global_buf = make_dynamic_buffer( p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); - auto bias_global_buf = make_dynamic_buffer( + auto bias_global_buf = make_dynamic_buffer( p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); // register allocation for output - StaticBuffer @@ -1514,7 +1514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3 typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename CBlockIdToBlockClusterAdaptor_K_N_H_W, bool HasMainE0BlockLoop, - ActivTypeEnum_t ActivType> + ActivTypeEnum ActivType> __device__ static void ConvBiasActivResizeAdd( const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_b_global, @@ -1527,26 +1527,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3 const 
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, integral_constant, - integral_constant) + integral_constant) { - static constexpr auto activ_type = integral_constant{}; + static constexpr auto activ_type = integral_constant{}; const auto bias_k0_k1_grid_desc = MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); - const auto a_global_buf = make_dynamic_buffer( + const auto a_global_buf = make_dynamic_buffer( p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( + const auto b_global_buf = make_dynamic_buffer( p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); - auto d_global_buf = make_dynamic_buffer( + auto d_global_buf = make_dynamic_buffer( p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); - auto bias_global_buf = make_dynamic_buffer( + auto bias_global_buf = make_dynamic_buffer( p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); // register allocation for output - StaticBuffer diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp index 8f75e013e9..87f955e88d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp @@ -79,8 +79,8 @@ template ( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, 
c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - auto d0_grid_buf = make_dynamic_buffer( + auto d0_grid_buf = make_dynamic_buffer( p_d0_grid, d_grid_desc_mblock_mperblock.GetElementSpaceSize()); - auto d1_grid_buf = make_dynamic_buffer( + auto d1_grid_buf = make_dynamic_buffer( p_d1_grid, d_grid_desc_mblock_mperblock.GetElementSpaceSize()); // divide block work by [M, N] @@ -399,7 +399,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, @@ -430,7 +430,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, @@ -484,10 +484,10 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto a_block_space_size_aligned = math::integer_least_multiple( a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_bk0_n_bk1.GetElementSpaceSize()); @@ -563,7 +563,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - auto c_shuffle_block_buf = make_dynamic_buffer( + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -632,7 +632,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, @@ 
-723,13 +723,13 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); // TODO: this should be implemented as a blockwise reduction - auto c_reduce_thread_buf = make_static_buffer( + auto c_reduce_thread_buf = make_static_buffer( c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); - auto d0_thread_buf = make_static_buffer( + auto d0_thread_buf = make_static_buffer( d_reduce_thread_desc_mperblock.GetElementSpaceSize()); - auto d1_thread_buf = make_static_buffer( + auto d1_thread_buf = make_static_buffer( d_reduce_thread_desc_mperblock.GetElementSpaceSize()); // reduce: threadwise copy from LDS to VGPR diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp index 0284bbd55e..6142f1f048 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp @@ -60,7 +60,7 @@ template ( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); // divide block work by [M, N] @@ -348,7 +348,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, @@ -379,7 +379,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, @@ -433,10 +433,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto 
a_block_space_size_aligned = math::integer_least_multiple( a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_bk0_n_bk1.GetElementSpaceSize()); @@ -512,7 +512,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - auto c_shuffle_block_buf = make_dynamic_buffer( + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -581,7 +581,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp index 9ce5b3dae6..c2f2b7bd15 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp @@ -132,7 +132,7 @@ template ( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize()); const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); @@ -460,7 +460,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 
BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, @@ -491,7 +491,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -543,10 +543,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0_n_k1.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp index ede928e02a..51a60d7365 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp @@ -59,7 +59,7 @@ template ( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize()); const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); @@ -410,7 +410,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, @@ -440,7 +440,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_K0_N_K1, 
BBlockTransferThreadClusterArrangeOrder, @@ -497,9 +497,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4 constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); // preload data into LDS diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp index d51ebf7faa..f192e599c9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp @@ -61,7 +61,7 @@ template ( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); @@ -399,7 +399,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, @@ -429,7 +429,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -486,9 +486,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); constexpr auto b_block_slice_copy_step = 
make_multi_index(0, K0PerBlock, 0, 0); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); // preload data into LDS @@ -560,7 +560,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock = GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - auto c_block_buf = make_dynamic_buffer( + auto c_block_buf = make_dynamic_buffer( static_cast(p_shared_block), c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -632,7 +632,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp index bf89bfe681..64fe857a03 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp @@ -64,7 +64,7 @@ template < typename FloatAcc, typename FloatCShuffle, typename FloatC, - InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, @@ -369,11 +369,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 const CElementwiseOperation& c_element_op, const Block2CTileMap& block_2_ctile_map) { - const auto a_grid_buf = make_dynamic_buffer( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, 
b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ -403,7 +403,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, @@ -434,7 +434,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, @@ -488,10 +488,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 constexpr auto a_block_space_size_aligned = math::integer_least_multiple( a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_bk0_n_bk1.GetElementSpaceSize()); @@ -567,7 +567,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); - auto c_shuffle_block_buf = make_dynamic_buffer( + auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ -644,7 +644,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp index 588c16d01b..6d1d64eb15 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp @@ -68,7 +68,7 @@ template < typename FloatAB, typename FloatAcc, typename FloatC, - InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, @@ -382,15 +382,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 const CElementwiseOperation& c_element_op, const Block2CTileMap& block_2_ctile_map) { - const auto a_grid_buf = make_dynamic_buffer( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); - auto c0_grid_buf = make_dynamic_buffer( + auto c0_grid_buf = make_dynamic_buffer( p_c0_grid, c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ -422,7 +422,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, @@ -453,7 +453,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -505,10 +505,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto 
a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0_n_k1.GetElementSpaceSize()); @@ -582,7 +582,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); - auto c_block_buf = make_dynamic_buffer( + auto c_block_buf = make_dynamic_buffer( static_cast(p_shared), c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ -661,7 +661,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp index 3f8b74f544..da1b9bc6f1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp @@ -74,7 +74,7 @@ template < typename FloatAB, typename FloatAcc, typename FloatC, - InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, @@ -397,19 +397,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 const CElementwiseOperation& c_element_op, const Block2CTileMap& block_2_ctile_map) { - const auto a_grid_buf = make_dynamic_buffer( + const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( + const auto b_grid_buf = make_dynamic_buffer( p_b_grid, 
b_grid_desc_k0_n_k1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( + auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); - auto c0_grid_buf = make_dynamic_buffer( + auto c0_grid_buf = make_dynamic_buffer( p_c0_grid, c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); - auto c1_grid_buf = make_dynamic_buffer( + auto c1_grid_buf = make_dynamic_buffer( p_c1_grid, c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ -441,7 +441,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 BlockwiseTensorSliceTransfer_v4r1, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, @@ -471,7 +471,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 BlockwiseTensorSliceTransfer_v4r1, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -522,10 +522,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); - auto a_block_buf = make_dynamic_buffer( + auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( + auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0_n_k1.GetElementSpaceSize()); @@ -599,7 +599,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); - auto c_block_buf = make_dynamic_buffer( + auto c_block_buf = make_dynamic_buffer( static_cast(p_shared), c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl .GetElementSpaceSize()); @@ 
-678,7 +678,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 Sequence<0, 1, 2, 3, 4, 5, 6, 7>, 7, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>{ c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp index 5293049024..2b50852f43 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp @@ -45,13 +45,13 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe const index_t thread_global_id = block_global_id * BlockSize + thread_local_id; - StaticBuffer value_buf; + StaticBuffer value_buf; value_buf(I0) = value; constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - auto global_buf = make_dynamic_buffer( + auto global_buf = make_dynamic_buffer( p_global, grid_1d_buffer_desc.GetElementSpaceSize()); if(thread_global_id < grid_1d_buffer_desc.GetElementSize()) @@ -65,7 +65,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe Sequence<0>, 0, 1, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, 1, true>( grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{}); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 2ce64a9840..6521913541 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -56,7 +56,7 @@ template ::type = false> @@ -407,7 +407,7 @@ struct ThreadwiseTensorSliceTransfer_v2 // 3. src_slice_origin and dst_slice_origin are not known at compile-time, // 4. 
Use thread buffer template buffer_; + StaticBuffer buffer_; SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp deleted file mode 100644 index 1ef098f6d5..0000000000 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r4.hpp +++ /dev/null @@ -1,523 +0,0 @@ -#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP -#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" - -namespace ck { - -// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory -// and sometimes useless instructions: -// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument -// instead -// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same -// tensor coordinate instead -// 3. Don't use a pointer to VGPR buffer, use vector instead - -// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 -// TODO: fix this -// Assume: -// 1. src: -// 1. SrcDesc is known at compile-time -// 2. SrcBuffer is StaticBuffer -// 3. SrcSliceOrginIdx is known at compile-time -// 2. dst: -// 1. DstDesc is not known at compile-time -// 2. DstBuffer is DynamicBuffer -// 3. 
DstSliceOrginIdx is not known at compile time -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v1r4 -{ - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{})); - using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{})); - - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{})); - using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v1r4( - const DstDesc& dst_desc, - const Dst0Desc& dst0_desc, - const Dst1Desc& dst1_desc, - const Index& dst_slice_origin_idx, - const DstElementwiseOperation& dst_element_op) - : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)), - dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)), - dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin_idx)), - dst_element_op_{dst_element_op} - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf, - const DstStepHacks& dst_step_hacks, - const Dst0Desc& dst0_desc, - const Dst0Buffer& dst0_buf, - const Dst0StepHacks& dst0_step_hacks, - const Dst1Desc& dst1_desc, - const Dst1Buffer& dst1_buf, - const Dst1StepHacks& dst1_step_hacks) - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! 
SrcDesc need to known at compile-time"); - - static_assert(is_known_at_compile_time>::value, - "wrong! SrcSliceOrigin need to known at compile-time"); - - static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); - - // SrcDesc and src_slice_origin_idx are known at compile-time - constexpr auto src_desc = remove_cvref_t{}; - constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // make forward steps: dst - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make forward steps: dst0 - // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same - // DstScalarPerVector - // TODO: fix this - const auto dst0_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? 
dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]); - }, - Number{}); - - // make forward steps: dst1 - // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same - // DstScalarPerVector - // TODO: fix this - const auto dst1_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst1_desc, forward_step_idx, dst1_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps: dst - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // make backward steps: dst0 - // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same - // DstScalarPerVector - // TODO: fix this - const auto dst0_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]); - }, - Number{}); - - // make backward steps: dst1 - // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same - // DstScalarPerVector - // TODO: fix this - const auto dst1_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst1_desc, backward_step_idx, dst1_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - typename vector_type_maker::type dst_vector; - - using dst_vector_t = - typename vector_type_maker::type::type; - - // load dst0 and dst1 and apply elementwise operation - { - // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 - // TODO: fix this - static_assert(DstScalarPerVector == 1, "wrong!"); - - // copy data from src_buf into dst_vector_src_data - constexpr index_t src_offset = - src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx); - - const SrcData src_v = src_buf[Number{}]; - - // load dst0 and dst1 - const bool is_dst0_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc, - dst0_coord_); - const bool is_dst1_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst1_desc, - dst1_coord_); - - const DstData dst0_v = - dst0_buf.template Get(dst0_coord_.GetOffset(), is_dst0_valid); - const DstData dst1_v = - dst1_buf.template Get(dst1_coord_.GetOffset(), is_dst1_valid); - -#if 
!CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE - // apply element-wise operation in SrcData type - const SrcData dst_v = dst_element_op_( - src_v, type_convert(dst0_v), type_convert(dst1_v)); - - // apply type convert - dst_vector.template AsType()(Number<0>{}) = type_convert(dst_v); -#else - // apply element-wise operation in DstData type - DstData dst_v; - - dst_element_op_(dst_v, src_v, dst0_v, dst1_v); - - dst_vector.template AsType()(Number<0>{}) = dst_v; -#endif - } - - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add) - { - - typename vector_type_maker::type tmp; - tmp.template AsType()(Number<0>{}) = - dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid); - - static_for<0, DstScalarPerVector, 1>{}([&](auto t) { - dst_vector.template AsType()(t) += tmp.template AsType()[t]; - }); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - 
move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - - // dst0 - move_tensor_coordinate( - dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]); - - // dst1 - move_tensor_coordinate( - dst1_desc, dst1_coord_, dst1_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - - // dst0 - move_tensor_coordinate( - dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]); - - // dst1 - move_tensor_coordinate( - dst1_desc, dst1_coord_, dst1_backward_steps[dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf, - const Dst0Desc& dst0_desc, - const Dst0Buffer& dst0_buf, - const Dst1Desc& dst1_desc, - const Dst1Buffer& dst1_buf) - { - auto f_step_hacks = [&](auto desc) { - constexpr index_t ntransform = decltype(desc)::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - return step_hacks; - }; - - Run(SrcDesc{}, - SrcSliceOriginIdx{}, - src_buf, - dst_desc, - dst_buf, - f_step_hacks(dst_desc), - dst0_desc, - dst0_buf, - f_step_hacks(dst0_desc), - dst1_desc, - dst1_buf, - f_step_hacks(dst1_desc)); - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = 
generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by Run(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? 
- const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - DstCoord dst_coord_; - Dst0Coord dst0_coord_; - Dst1Coord dst1_coord_; - const DstElementwiseOperation dst_element_op_; -}; // namespace ck - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp deleted file mode 100644 index 6389680c5f..0000000000 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v1r5.hpp +++ /dev/null @@ -1,453 +0,0 @@ -#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP -#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" - -namespace ck { - -// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory -// and sometimes useless instructions: -// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument -// instead -// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same -// tensor coordinate instead -// 3. Don't use a pointer to VGPR buffer, use vector instead - -// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 -// TODO: fix this -// Assume: -// 1. src: -// 1. SrcDesc is known at compile-time -// 2. SrcBuffer is StaticBuffer -// 3. SrcSliceOrginIdx is known at compile-time -// 2. dst: -// 1. DstDesc is not known at compile-time -// 2. DstBuffer is DynamicBuffer -// 3. 
DstSliceOrginIdx is not known at compile time -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v1r5 -{ - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{})); - - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v1r5( - const DstDesc& dst_desc, - const Dst0Desc& dst0_desc, - const Index& dst_slice_origin_idx, - const DstElementwiseOperation& dst_element_op) - : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)), - dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)), - dst_element_op_{dst_element_op} - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf, - const DstStepHacks& dst_step_hacks, - const Dst0Desc& dst0_desc, - const Dst0Buffer& dst0_buf, - const Dst0StepHacks& dst0_step_hacks) - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - - static_assert(is_known_at_compile_time>::value, - "wrong! SrcSliceOrigin need to known at compile-time"); - - static_assert(SrcBuffer::IsStaticBuffer(), "wrong! 
SrcBuffer need to be StaticBuffer"); - - // SrcDesc and src_slice_origin_idx are known at compile-time - constexpr auto src_desc = remove_cvref_t{}; - constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // make forward steps: dst - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make forward steps: dst0 - // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 - // TODO: fix this - const auto dst0_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps: dst - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // make backward steps: dst0 - // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 - // TODO: fix this - const auto dst0_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? 
ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - typename vector_type_maker::type dst_vector; - - using dst_vector_t = - typename vector_type_maker::type::type; - - // load dst0 and apply elementwise operation - { - // WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1 - // TODO: fix this - static_assert(DstScalarPerVector == 1, "wrong!"); - - // copy data from src_buf into dst_vector_src_data - constexpr index_t src_offset = - src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx); - - const SrcData src_v = src_buf[Number{}]; - - // load dst0 - const bool is_dst0_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc, - dst0_coord_); - const DstData dst0_v = - dst0_buf.template Get(dst0_coord_.GetOffset(), is_dst0_valid); - -#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE - // apply element-wise operation in SrcData type - const SrcData dst_v = dst_element_op_(src_v, type_convert(dst0_v)); - - // apply type convert - dst_vector.template AsType()(Number<0>{}) = type_convert(dst_v); -#else - // apply element-wise operation in DstData type - const DstData dst_v = dst_element_op_(src_v, dst0_v); - - dst_vector.template AsType()(Number<0>{}) = dst_v; -#endif - } - - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == 
InMemoryDataOperationEnum_t::Add) - { - - typename vector_type_maker::type tmp; - tmp.template AsType()(Number<0>{}) = - dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid); - - static_for<0, DstScalarPerVector, 1>{}([&](auto t) { - dst_vector.template AsType()(t) += tmp.template AsType()[t]; - }); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - - // dst0 - move_tensor_coordinate( - dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - - // dst0 - move_tensor_coordinate( - dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf, - const Dst0Desc& dst0_desc, - const Dst0Buffer& dst0_buf) - { - auto f_step_hacks = [&](auto desc) { - constexpr index_t ntransform = decltype(desc)::GetNumOfTransform(); - - constexpr auto zeros = typename 
uniform_sequence_gen::type{}; - - constexpr auto step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - return step_hacks; - }; - - Run(SrcDesc{}, - SrcSliceOriginIdx{}, - src_buf, - dst_desc, - dst_buf, - f_step_hacks(dst_desc), - dst0_desc, - dst0_buf, - f_step_hacks(dst0_desc)); - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? 
ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by Run(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - DstCoord dst_coord_; - Dst0Coord dst0_coord_; - const DstElementwiseOperation dst_element_op_; -}; // namespace ck - -} // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index b20b391196..dbe057e20d 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst template thread_scratch_id = Number{}) { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, "wrong!"); static_assert( @@ -271,7 +271,7 
@@ struct ThreadwiseTensorSliceTransfer_v3r1 static_ford{}([&](auto idx) { // convert from SrcData to DstData here dst_thread_scratch_(idx) = - type_convert(src_thread_scratch_tuple[thread_scratch_id][idx]); + type_convert(src_thread_scratch_tuple_[thread_scratch_id][idx]); }); #else // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_ @@ -361,8 +361,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // TODO move this elsewhere TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id); - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, "wrong!"); static_assert( @@ -763,13 +763,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1 static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; - using SrcThreadScratch = StaticTensorTupleOfVectorBuffer; - using DstThreadScratch = StaticTensorTupleOfVectorBuffer __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, "wrong!"); static_assert( @@ -369,8 +369,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3 // TODO move this elsewhere TransferDataFromSrcThreadScratchToDstThreadScratch(); - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, "wrong!"); 
static_assert( @@ -859,14 +859,14 @@ struct ThreadwiseTensorSliceTransfer_v3r3 static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; - StaticTensorTupleOfVectorBuffer src_thread_scratch_; - StaticTensorTupleOfVectorBuffer buffer_; + StaticBuffer buffer_; SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp index b180f7f432..c6360d3b29 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp @@ -29,7 +29,7 @@ template struct ThreadwiseTensorSliceTransfer_v6r1 diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp index 67a2bc9bb2..ae85ba91e5 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp @@ -31,7 +31,7 @@ template diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp index fd3a5151fb..47024d5e68 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp @@ -33,7 +33,7 @@ template static constexpr auto GetMfma() { -#if defined(CK_AMD_GPU_GFX90A) +#if defined(CK_USE_AMD_MFMA_BF16_1K_OP) return MfmaInstr::mfma_f32_32x32x8bf16_1k; #else return MfmaInstr::mfma_f32_32x32x4bf16; @@ -486,7 +486,7 @@ struct MfmaSelector template <> static constexpr auto GetMfma() { 
-#if defined(CK_AMD_GPU_GFX90A) +#if defined(CK_USE_AMD_MFMA_BF16_1K_OP) return MfmaInstr::mfma_f32_16x16x16bf16_1k; #else return MfmaInstr::mfma_f32_16x16x8bf16; diff --git a/include/ck/utility/amd_address_space.hpp b/include/ck/utility/amd_address_space.hpp index 797fb7ab2f..3c5939aaf3 100644 --- a/include/ck/utility/amd_address_space.hpp +++ b/include/ck/utility/amd_address_space.hpp @@ -9,7 +9,7 @@ namespace ck { -enum struct AddressSpaceEnum_t +enum struct AddressSpaceEnum { Generic, Global, @@ -19,7 +19,7 @@ enum struct AddressSpaceEnum_t }; template -__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p) +__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) { // cast a pointer in "Constant" address space (4) to "Generic" address space (0) // only c-style pointer cast seems be able to be compiled @@ -30,13 +30,13 @@ __device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p) } template -__host__ __device__ T CONSTANT* cast_pointer_to_constant_address_space(T* p) +__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) { // cast a pointer in "Generic" address space (0) to "Constant" address space (4) // only c-style pointer cast seems be able to be compiled #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wold-style-cast" - return (T CONSTANT*)p; // NOLINT(old-style-cast) + return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) #pragma clang diagnostic pop } diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index c8fb9cb1a3..53c24b9a98 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -1,6 +1,4 @@ -#ifndef CK_AMD_BUFFER_ADDRESSING_HPP -#define CK_AMD_BUFFER_ADDRESSING_HPP - +#pragma once #include "data_type.hpp" namespace ck { @@ -87,6 +85,7 @@ llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, 
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); + // buffer load fp16 __device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, @@ -212,6 +211,7 @@ llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16"); + // buffer store fp32 __device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, @@ -233,6 +233,7 @@ llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); + // buffer atomic-add fp16 __device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( half2_t vdata, @@ -1046,4 +1047,3 @@ amd_buffer_atomic_add(const typename vector_type_maker::type::type src_thr } } // namespace ck -#endif diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 494cbb383d..45f387ef2a 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -1,6 +1,4 @@ -#ifndef CK_COMMON_HEADER_HPP -#define CK_COMMON_HEADER_HPP - +#pragma once #include "config.hpp" #include "array.hpp" #include "container_helper.hpp" @@ -20,30 +18,29 @@ #include "number.hpp" #include "sequence.hpp" #include "sequence_helper.hpp" -#include "synchronization.hpp" #include "tuple.hpp" #include "tuple_helper.hpp" #include "type.hpp" #include "magic_division.hpp" -#include "utility.hpp" #include "c_style_pointer_cast.hpp" -#include "amd_address_space.hpp" -#include "amd_buffer_addressing.hpp" -#include "static_buffer.hpp" -#include "dynamic_buffer.hpp" #include "is_known_at_compile_time.hpp" #include "transpose_vectors.hpp" #include "inner_product.hpp" #include "element_wise_operation.hpp" #include "debug.hpp" +#include "amd_buffer_addressing.hpp" +#include "get_id.hpp" +#include "synchronization.hpp" +#include "amd_address_space.hpp" +#include "static_buffer.hpp" +#include "dynamic_buffer.hpp" + // TODO: remove this #if CK_USE_AMD_INLINE_ASM 
#include "amd_inline_asm.hpp" #endif -#if CK_USE_AMD_XDLOPS +#ifdef CK_USE_AMD_MFMA #include "amd_xdlops.hpp" #endif - -#endif diff --git a/include/ck/utility/data_type_enum.hpp b/include/ck/utility/data_type_enum.hpp index 7c60e0fe39..fda6a2b05c 100644 --- a/include/ck/utility/data_type_enum.hpp +++ b/include/ck/utility/data_type_enum.hpp @@ -3,7 +3,7 @@ namespace ck { -enum struct DataTypeEnum_t +enum struct DataTypeEnum { Half = 0, Float = 1, diff --git a/include/ck/utility/data_type_enum_helper.hpp b/include/ck/utility/data_type_enum_helper.hpp index 451ce992b1..9c8e01a7e3 100644 --- a/include/ck/utility/data_type_enum_helper.hpp +++ b/include/ck/utility/data_type_enum_helper.hpp @@ -6,35 +6,35 @@ namespace ck { -template +template struct get_datatype_from_enum; template <> -struct get_datatype_from_enum +struct get_datatype_from_enum { using type = int8_t; }; template <> -struct get_datatype_from_enum +struct get_datatype_from_enum { using type = int32_t; }; template <> -struct get_datatype_from_enum +struct get_datatype_from_enum { using type = half_t; }; template <> -struct get_datatype_from_enum +struct get_datatype_from_enum { using type = float; }; template <> -struct get_datatype_from_enum +struct get_datatype_from_enum { using type = double; }; @@ -45,31 +45,31 @@ struct get_datatype_enum_from_type; template <> struct get_datatype_enum_from_type { - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8; + static constexpr DataTypeEnum value = DataTypeEnum::Int8; }; template <> struct get_datatype_enum_from_type { - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32; + static constexpr DataTypeEnum value = DataTypeEnum::Int32; }; template <> struct get_datatype_enum_from_type { - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half; + static constexpr DataTypeEnum value = DataTypeEnum::Half; }; template <> struct get_datatype_enum_from_type { - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float; + static constexpr 
DataTypeEnum value = DataTypeEnum::Float; }; template <> struct get_datatype_enum_from_type { - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double; + static constexpr DataTypeEnum value = DataTypeEnum::Double; }; } // namespace ck diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index d9193ce65f..3c8e5010a2 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -1,6 +1,4 @@ -#ifndef CK_BUFFER_HPP -#define CK_BUFFER_HPP - +#pragma once #include "amd_buffer_addressing.hpp" #include "c_style_pointer_cast.hpp" #include "config.hpp" @@ -8,7 +6,7 @@ namespace ck { -template @@ -34,7 +32,7 @@ struct DynamicBuffer { } - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() + __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return BufferAddressSpace; } @@ -55,7 +53,7 @@ struct DynamicBuffer constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); + "wrong! X should contain multiple T"); #if CK_USE_AMD_BUFFER_LOAD bool constexpr use_amd_buffer_addressing = true; @@ -63,7 +61,7 @@ struct DynamicBuffer bool constexpr use_amd_buffer_addressing = false; #endif - if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing) + if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) { constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; @@ -81,50 +79,48 @@ struct DynamicBuffer } else { - if constexpr(InvalidElementUseNumericalZeroValue) + if(is_valid_element) { #if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS X tmp; __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X)); - return is_valid_element ? tmp : X{0}; + return tmp; #else - return is_valid_element ? 
*c_style_pointer_cast(&p_data_[i]) : X{0}; + return *c_style_pointer_cast(&p_data_[i]); #endif } else { -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS - X tmp; - - __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X)); - - return is_valid_element ? tmp : X{invalid_element_value_}; -#else - return is_valid_element ? *c_style_pointer_cast(&p_data_[i]) - : X{invalid_element_value_}; -#endif + if constexpr(InvalidElementUseNumericalZeroValue) + { + return X{0}; + } + else + { + return X{invalid_element_value_}; + } } } } - template >::type, typename scalar_type>::type>::value, bool>::type = false> __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x) { - if constexpr(Op == InMemoryDataOperationEnum_t::Set) + if constexpr(Op == InMemoryDataOperationEnum::Set) { this->template Set(i, is_valid_element, x); } - else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd) + else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd) { this->template AtomicAdd(i, is_valid_element, x); } - else if constexpr(Op == InMemoryDataOperationEnum_t::Add) + else if constexpr(Op == InMemoryDataOperationEnum::Add) { auto tmp = this->template Get(i, is_valid_element); this->template Set(i, is_valid_element, x + tmp); @@ -145,143 +141,120 @@ struct DynamicBuffer constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); + "wrong! 
X should contain multiple T"); - if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global) - { #if CK_USE_AMD_BUFFER_STORE + bool constexpr use_amd_buffer_addressing = true; +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + +#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE + bool constexpr workaround_int8_ds_write_issue = true; +#else + bool constexpr workaround_int8_ds_write_issue = false; +#endif + + if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) + { constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_store, t_per_x>( x, p_data_, i, is_valid_element, element_space_size_); -#else - if(is_valid_element) - { -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS - X tmp = x; - - __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); -#else - *c_style_pointer_cast(&p_data_[i]) = x; -#endif - } -#endif } - else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds) + else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds && + is_same>::type, int8_t>::value && + workaround_int8_ds_write_issue) { if(is_valid_element) { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS - X tmp = x; - - __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); -#else - *c_style_pointer_cast(&p_data_[i]) = x; -#endif -#else - // HACK: compiler would lower IR "store address_space(3)" into - // inefficient + // HACK: compiler would lower IR "store address_space(3)" into inefficient // ISA, so I try to let compiler emit IR "store" which would be lower to // ds_write_b128 // TODO: remove this after compiler fix - if constexpr(is_same>::type, int8_t>::value) - { - static_assert((is_same, int8_t>::value && - is_same, int8_t>::value) || - (is_same, int8_t>::value && - is_same, int8x2_t>::value) || - (is_same, int8_t>::value && - is_same, int8x4_t>::value) || - (is_same, int8_t>::value && - is_same, int8x8_t>::value) || - (is_same, int8_t>::value && - 
is_same, int8x16_t>::value) || - (is_same, int8x4_t>::value && - is_same, int8x4_t>::value) || - (is_same, int8x8_t>::value && - is_same, int8x8_t>::value) || - (is_same, int8x16_t>::value && - is_same, int8x16_t>::value), - "wrong! not implemented for this combination, please add " - "implementation"); + static_assert((is_same, int8_t>::value && + is_same, int8_t>::value) || + (is_same, int8_t>::value && + is_same, int8x2_t>::value) || + (is_same, int8_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8_t>::value && + is_same, int8x8_t>::value) || + (is_same, int8_t>::value && + is_same, int8x16_t>::value) || + (is_same, int8x4_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8x8_t>::value && + is_same, int8x8_t>::value) || + (is_same, int8x16_t>::value && + is_same, int8x16_t>::value), + "wrong! not implemented for this combination, please add " + "implementation"); - if constexpr(is_same, int8_t>::value && - is_same, int8_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x2_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x4_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x8_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x16_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - 
*c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x4_t>::value && - is_same, int8x4_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x8_t>::value && - is_same, int8x8_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x16_t>::value && - is_same, int8x16_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - } - else + if constexpr(is_same, int8_t>::value && + is_same, int8_t>::value) { -#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS - X tmp = x; - - __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); -#else - *c_style_pointer_cast(&p_data_[i]) = x; -#endif + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x2_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x16_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove 
this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x4_t>::value && + is_same, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x8_t>::value && + is_same, int8x8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x16_t>::value && + is_same, int8x16_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); } -#endif } } else @@ -305,27 +278,49 @@ struct DynamicBuffer bool>::type = false> __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) { + using scalar_t = typename scalar_type>::type; + // X contains multiple T constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); + "wrong! 
X should contain multiple T"); - static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem"); + static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); -#if CK_USE_AMD_BUFFER_ATOMIC_ADD - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - - amd_buffer_atomic_add, t_per_x>( - x, p_data_, i, is_valid_element, element_space_size_); +#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT + bool constexpr use_amd_buffer_addressing = + is_same_v, int32_t> || + is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0); +#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT) + bool constexpr use_amd_buffer_addressing = is_same_v, int32_t>; +#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT + bool constexpr use_amd_buffer_addressing = + is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0); #else - if(is_valid_element) - { - atomicAdd(&p_data_[i], x); - } + bool constexpr use_amd_buffer_addressing = false; #endif + + if constexpr(use_amd_buffer_addressing) + { + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_atomic_add, t_per_x>( + x, p_data_, i, is_valid_element, element_space_size_); + } + else + { + if(is_valid_element) + { + // FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when + // calling it + atomicAdd(c_style_pointer_cast(&p_data_[i]), x); + } + } } __host__ __device__ static constexpr bool IsStaticBuffer() { return false; } @@ -333,14 +328,14 @@ struct DynamicBuffer __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; } }; -template +template __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size) { return DynamicBuffer{p, element_space_size}; } template < - AddressSpaceEnum_t BufferAddressSpace, + AddressSpaceEnum 
BufferAddressSpace, typename T, typename ElementSpaceSize, typename X, @@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element } } // namespace ck -#endif diff --git a/include/ck/utility/utility.hpp b/include/ck/utility/get_id.hpp similarity index 88% rename from include/ck/utility/utility.hpp rename to include/ck/utility/get_id.hpp index 7664066126..f742512d40 100644 --- a/include/ck/utility/utility.hpp +++ b/include/ck/utility/get_id.hpp @@ -1,6 +1,4 @@ -#ifndef CK_UTILITY_HPP -#define CK_UTILITY_HPP - +#pragma once #include "config.hpp" namespace ck { @@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; } __device__ index_t get_grid_size() { return gridDim.x; } } // namespace ck - -#endif diff --git a/include/ck/utility/multi_index.hpp b/include/ck/utility/multi_index.hpp index 0bb34fb1e2..f395b5ee71 100644 --- a/include/ck/utility/multi_index.hpp +++ b/include/ck/utility/multi_index.hpp @@ -3,7 +3,7 @@ #include "common_header.hpp" -#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX +#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX #include "array_multi_index.hpp" #else #include "statically_indexed_array_multi_index.hpp" diff --git a/include/ck/utility/reduction_enums.hpp b/include/ck/utility/reduction_enums.hpp index e97108179e..9089fd6116 100644 --- a/include/ck/utility/reduction_enums.hpp +++ b/include/ck/utility/reduction_enums.hpp @@ -28,7 +28,7 @@ namespace ck { -enum class ReduceTensorOp_t +enum struct ReduceTensorOp { ADD = 0, MUL = 1, @@ -41,19 +41,19 @@ enum class ReduceTensorOp_t // MUL_NO_ZEROS = 8, }; -enum class NanPropagation_t +enum struct NanPropagation { NOT_PROPAGATE_NAN = 0, PROPAGATE_NAN = 1, }; -enum class ReduceTensorIndices_t +enum struct ReduceTensorIndices { NO_INDICES = 0, FLATTENED_INDICES = 1, }; -enum class IndicesType_t +enum struct IndicesType { INDICES_32BIT = 0, INDICES_64BIT = 1, diff --git a/include/ck/utility/static_buffer.hpp 
b/include/ck/utility/static_buffer.hpp index add59cf843..f36328fa5f 100644 --- a/include/ck/utility/static_buffer.hpp +++ b/include/ck/utility/static_buffer.hpp @@ -6,7 +6,7 @@ namespace ck { // static buffer for scalar -template // TODO remove this bool, no longer needed @@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray __host__ __device__ constexpr StaticBuffer() : base{} {} - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() - { - return AddressSpace; - } + __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; } __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } @@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray }; // static buffer for vector -template +template __host__ __device__ constexpr auto make_static_buffer(Number) { return StaticBuffer{}; diff --git a/include/ck/utility/synchronization.hpp b/include/ck/utility/synchronization.hpp index da74f2074d..d46628d913 100644 --- a/include/ck/utility/synchronization.hpp +++ b/include/ck/utility/synchronization.hpp @@ -7,7 +7,7 @@ namespace ck { __device__ void block_sync_lds() { -#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM +#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM asm volatile("\ s_waitcnt lgkmcnt(0) \n \ s_barrier \ diff --git a/library/include/ck/library/host_tensor/conv_common.hpp b/library/include/ck/library/host_tensor/conv_common.hpp index 8c11abda49..b60af7d664 100644 --- a/library/include/ck/library/host_tensor/conv_common.hpp +++ b/library/include/ck/library/host_tensor/conv_common.hpp @@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes } template -inline auto activ(T v, const ck::ActivTypeEnum_t activ_type) +inline auto activ(T v, const ck::ActivTypeEnum activ_type) { const T alpha = 0.3; switch(activ_type) { - case ck::ActivTypeEnum_t::None: return v; - case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? 
v : alpha * v); - case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v))); + case ck::ActivTypeEnum::None: return v; + case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v); + case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v))); default: throw std::runtime_error("unsupported activ type"); break; } } diff --git a/library/include/ck/library/host_tensor/device_tensor.hpp b/library/include/ck/library/host_tensor/device_tensor.hpp index 1a7a34a4cf..b8d3ccc8a0 100644 --- a/library/include/ck/library/host_tensor/device_tensor.hpp +++ b/library/include/ck/library/host_tensor/device_tensor.hpp @@ -1,6 +1,5 @@ #pragma once #include "host_tensor.hpp" -#include "common_header.hpp" template void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout) diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index f5e01ccc94..cf301bb18a 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -39,8 +39,8 @@ namespace ck { namespace host_reduce { -using ck::NanPropagation_t; -using ck::ReduceTensorOp_t; +using ck::NanPropagation; +using ck::ReduceTensorOp; template static inline bool float_equal_one(T); @@ -66,44 +66,44 @@ static inline bool float_equal_zero(half_float::half x) return x == static_cast(0.0f); }; -template +template __host__ static inline std::function PreUnaryOpFn(int) { using std::abs; - if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1) + if constexpr(ReduceOpId == ReduceTensorOp::NORM1) { return ([&](AccDataType& a_) { a_ = abs(a_); }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) + else if constexpr(ReduceOpId == ReduceTensorOp::NORM2) { return ([&](AccDataType& a_) { a_ = a_ * a_; }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) + else if constexpr(ReduceOpId == ReduceTensorOp::AMAX) { return ([&](AccDataType& a_) { a_ = abs(a_); 
}); } else { - // ReduceTensorOp_t::AVG: - // ReduceTensorOp_t::ADD: - // ReduceTensorOp_t::MUL: - // ReduceTensorOp_t::MIN: - // ReduceTensorOp_t::MAX: + // ReduceTensorOp::AVG: + // ReduceTensorOp::ADD: + // ReduceTensorOp::MUL: + // ReduceTensorOp::MIN: + // ReduceTensorOp::MAX: return ([&](AccDataType&) {}); }; }; -template +template __host__ static inline std::function PosUnaryOpFn(int32_t divider) { using std::sqrt; - if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) + if constexpr(ReduceOpId == ReduceTensorOp::NORM2) { return ([&](AccDataType& a_) { a_ = sqrt(a_); }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG) + else if constexpr(ReduceOpId == ReduceTensorOp::AVG) { return ([&, divider](AccDataType& a_) { a_ = a_ / static_cast(static_cast(divider)); @@ -111,36 +111,36 @@ __host__ static inline std::function PosUnaryOpFn(int32_t di } else { - // ReduceTensorOp_t::ADD: - // ReduceTensorOp_t::NORM1: - // ReduceTensorOp_t::MUL: - // ReduceTensorOp_t::MIN: - // ReduceTensorOp_t::MAX: - // ReduceTensorOp_t::AMAX: + // ReduceTensorOp::ADD: + // ReduceTensorOp::NORM1: + // ReduceTensorOp::MUL: + // ReduceTensorOp::MIN: + // ReduceTensorOp::MAX: + // ReduceTensorOp::AMAX: return ([&](AccDataType&) {}); } }; -template +template __host__ static inline std::function ReduceOpFn() { - if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG || - ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2) + if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG || + ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2) { return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) + else if constexpr(ReduceOpId == ReduceTensorOp::MUL) { return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + else if constexpr(ReduceOpId == 
ReduceTensorOp::MIN) { return ([&](AccDataType& a_, AccDataType b_) { if(a_ > b_) a_ = b_; }); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) + else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX) { return ([&](AccDataType& a_, AccDataType b_) { if(a_ < b_) @@ -149,10 +149,10 @@ __host__ static inline std::function ReduceOpFn } }; -template +template __host__ static inline std::function ReduceOpFn2() { - if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + if constexpr(ReduceOpId == ReduceTensorOp::MIN) { return ([&](AccDataType& a_, AccDataType b_, bool& changed) { if(a_ > b_) @@ -164,7 +164,7 @@ __host__ static inline std::function{}); }; }; -template +template __host__ static inline AccDataType ReduceOpZeroVal() { - if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) + if constexpr(ReduceOpId == ReduceTensorOp::MUL) { return (static_cast(1.0f)); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) + else if constexpr(ReduceOpId == ReduceTensorOp::MIN) { return (std::numeric_limits::max()); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX) + else if constexpr(ReduceOpId == ReduceTensorOp::MAX) { return (std::numeric_limits::lowest()); } - else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) + else if constexpr(ReduceOpId == ReduceTensorOp::AMAX) { return (static_cast(0.0f)); } else { - // ReduceTensorOp_t::ADD - // ReduceTensorOp_t::AVG - // ReduceTensorOp_t::NORM1 - // ReduceTensorOp_t::NORM2 + // ReduceTensorOp::ADD + // ReduceTensorOp::AVG + // ReduceTensorOp::NORM1 + // ReduceTensorOp::NORM2 return (static_cast(0.0f)); }; }; diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index 4cc8f3fefd..f25d753a46 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -104,7 +104,7 @@ static size_t 
get_offset_from_index(const std::vector& strides, template & a_k_m, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp index abaaf32113..eb78ba96d8 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_kn_nm.hpp @@ -202,7 +202,7 @@ void device_gemm_xdlops_km_kn_nm(const Tensor& a_k_m, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp index 0a97d361d4..dbd318ce4d 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_mn.hpp @@ -398,7 +398,7 @@ void device_gemm_xdlops_km_nk_mn(const Tensor& a_k_m, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp index d51caa3847..5b819fd1af 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_km_nk_nm.hpp @@ -202,7 +202,7 @@ void 
device_gemm_xdlops_km_nk_nm(const Tensor& a_k_m, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp index 30ede2517b..4b041777c3 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_mn.hpp @@ -398,7 +398,7 @@ void device_gemm_xdlops_mk_kn_mn(const Tensor& a_m_k, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp index 58ac3880d6..c848cd7936 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_kn_nm.hpp @@ -230,7 +230,7 @@ void device_gemm_xdlops_mk_kn_nm(const Tensor& a_m_k, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp index e99d570413..557624026d 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_mn.hpp @@ -499,7 +499,7 @@ void device_gemm_xdlops_mk_nk_mn(const Tensor& 
a_m_k, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp index a12cf0733a..06d8ed2940 100644 --- a/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp +++ b/library/include/ck/library/obselete_driver_offline/device_gemm_xdlops_mk_nk_nm.hpp @@ -286,7 +286,7 @@ void device_gemm_xdlops_mk_nk_nm(const Tensor& a_m_k, ABType, AccType, CType, - InMemoryDataOperationEnum_t::Set, + InMemoryDataOperationEnum::Set, decltype(a_k0_m_k1_grid_desc), decltype(b_k0_n_k1_grid_desc), decltype(c_m_n_grid_desc), diff --git a/library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp b/library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp index d207728a2e..000098f4fc 100644 --- a/library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp +++ b/library/include/ck/library/obselete_driver_offline/driver_contraction_dlops_v1r2.hpp @@ -10,7 +10,7 @@ template + ck::ActivTypeEnum activ_type> struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add { template + ck::ActivTypeEnum activ_type> struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad { template + ck::ActivTypeEnum activ_type> struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool { template ; #endif -template +template using deviceReduceBlockWisePtrType = DeviceReducePtr< typename reduce_unary_operator::InElementwiseOperation, typename reduce_unary_operator::AccElementwiseOperation>; @@ -57,9 +57,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void 
add_device_reduce_instance_blockwise( std::vector>& device_op_instances) { @@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise( AccElementwiseOperation; constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; static_for<0, std::tuple_size::value, 1>{}([&](auto i) { using cfg1 = @@ -123,15 +123,15 @@ void add_device_reduce_instance_blockwise( IndicesOpt>( \ std::vector> & device_op_instances) -#define ADD_BLOCKWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_BLOCKWISE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_BLOCKWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ @@ -150,15 +150,15 @@ void add_device_reduce_instance_blockwise( AccElementwiseOperation>> & \ device_op_instances) -#define ADD_BLOCKWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_BLOCKWISE_INST_REF_BY_ID( \ + 
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp index 5a0c18e7a3..8e47bbfb6a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp @@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple< >; #endif -template +template using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< typename reduce_unary_operator::InElementwiseOperation, typename reduce_unary_operator::AccElementwiseOperation>; @@ -44,9 +44,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void add_device_reduce_instance_blockwise_second_call( std::vector>& device_op_instances) @@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call( AccElementwiseOperation; constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? 
false : true; + constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true; static_assert(std::is_same::value, "InDataType and AccDataType should be the same to use " @@ -117,15 +117,15 @@ void add_device_reduce_instance_blockwise_second_call( std::vector> & \ device_op_instances) -#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \ @@ -145,15 +145,15 @@ void add_device_reduce_instance_blockwise_second_call( AccElementwiseOperation>> & \ device_op_instances) -#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index 3b317e1d80..bf10080b5e 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< >; #endif -template +template using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr:: InElementwiseOperation, @@ -59,9 +59,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void add_device_reduce_instance_multiblock_atomic_add( std::vector>& device_op_instances) @@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add( AccElementwiseOperation; constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; - static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES, + static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES, "AtomicAdd can only be used with reduction operations without indices!"); constexpr bool op_acceptable = - (ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL || - ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1); + (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL || + ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1); constexpr bool out_type_acceptable = (std::is_same::value || std::is_same::value); @@ -144,15 +144,15 @@ void add_device_reduce_instance_multiblock_atomic_add( std::vector> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ @@ -171,15 +171,15 @@ void add_device_reduce_instance_multiblock_atomic_add( AccElementwiseOperation>> & \ device_op_instances) -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ 
+ compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp index 8ab6328780..5c323ec175 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp @@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple< >; #endif -template +template using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< typename reduce_unary_operator::InElementwiseOperation, typename reduce_unary_operator::AccElementwiseOperation>; @@ -56,9 +56,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void add_device_reduce_instance_multiblock_partial_reduce( std::vector>& device_op_instances) @@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce( AccElementwiseOperation; constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; static_for<0, std::tuple_size::value, 1>{}([&](auto i) { using cfg1 = @@ -126,15 +126,15 @@ void add_device_reduce_instance_multiblock_partial_reduce( std::vector> & \ device_op_instances) -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \ @@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_partial_reduce( AccElementwiseOperation>> & \ device_op_instances) -#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index 9371672a54..f3a0781c2b 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple< >; #endif -template +template using deviceReduceThreadWisePtrType = DeviceReducePtr< typename reduce_unary_operator::InElementwiseOperation, typename reduce_unary_operator::AccElementwiseOperation>; @@ -57,9 +57,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void add_device_reduce_instance_threadwise( std::vector>& device_op_instances) { @@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise( AccElementwiseOperation; constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); - constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES); - constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; + constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? 
false : true; using cfg1 = ReductionConfiguration_1<256, 256, 1>; @@ -119,15 +119,15 @@ void add_device_reduce_instance_threadwise( IndicesOpt>( \ std::vector> & device_op_instances) -#define ADD_THREADWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) #define ADD_THREADWISE_INST_REF_BY_TYPE( \ @@ -146,15 +146,15 @@ void add_device_reduce_instance_threadwise( AccElementwiseOperation>> & \ device_op_instances) -#define ADD_THREADWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ +#define ADD_THREADWISE_INST_REF_BY_ID( \ + inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ + ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ + compT, \ + outT, \ + static_cast(ReduceOpId), \ + static_cast(NanOpt), \ + static_cast(IndicesOpt), \ + Rank, \ NumReduceDim) } // namespace device_reduce_instance diff --git a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp index 9c09936a3b..40337d674a 100644 --- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp @@ -39,7 +39,7 @@ void host_direct_convolution_add_nchwc(const Tensor& in, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, const InRightPads&, 
- const ck::ActivTypeEnum_t activ_type) + const ck::ActivTypeEnum activ_type) { using namespace ck; @@ -117,7 +117,7 @@ int main(int argc, char* argv[]) exit(1); } - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); const bool do_verification = std::stoi(argv[2]); @@ -167,7 +167,7 @@ int main(int argc, char* argv[]) const bool do_log = std::stoi(argv[4]); const int nrepeat = std::stoi(argv[5]); - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; #if 0 constexpr auto N = Number<1>{}; diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp index 6f28af8bd3..4b3e037fc0 100644 --- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp @@ -37,7 +37,7 @@ void host_direct_convolution_nchwc(const Tensor& in, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, const InRightPads&, - const ck::ActivTypeEnum_t activ_type) + const ck::ActivTypeEnum activ_type) { using namespace ck; @@ -102,7 +102,7 @@ int main(int argc, char* argv[]) exit(1); } - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); const bool do_verification = std::stoi(argv[2]); @@ -149,8 +149,8 @@ int main(int argc, char* argv[]) const bool do_log = std::stoi(argv[4]); const int nrepeat = std::stoi(argv[5]); - // constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid; - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid; 
+ constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; #if 0 constexpr auto N = Number<1>{}; diff --git a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp index 846ce94f91..c3e6027925 100644 --- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp @@ -38,7 +38,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor& in, const ConvDilations& conv_dilations, const InLeftPads& in_left_pads, const InRightPads&, - const ck::ActivTypeEnum_t activ_type) + const ck::ActivTypeEnum activ_type) { using namespace ck; @@ -126,7 +126,7 @@ int main(int argc, char* argv[]) exit(1); } - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; const ConvForwardAlgo algo = static_cast(std::stoi(argv[1])); const bool do_verification = std::stoi(argv[2]); @@ -176,18 +176,18 @@ int main(int argc, char* argv[]) const bool do_log = std::stoi(argv[4]); const int nrepeat = std::stoi(argv[5]); - constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; + constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu; #if 1 - constexpr auto N = Number<1>{}; - constexpr auto Hi = Number<1080>{}; - constexpr auto Wi = Number<1920>{}; - constexpr auto Y = Number<3>{}; - constexpr auto X = Number<3>{}; - constexpr auto C0 = Number<2>{}; - constexpr auto C1 = Number<8>{}; - constexpr auto K0 = Number<2>{}; - constexpr auto K1 = Number<8>{}; + constexpr auto N = Number<1>{}; + constexpr auto Hi = Number<1080>{}; + constexpr auto Wi = Number<1920>{}; + constexpr auto Y = Number<3>{}; + constexpr auto X = Number<3>{}; + constexpr auto C0 = Number<2>{}; + constexpr auto C1 = Number<8>{}; + constexpr auto K0 = Number<2>{}; + constexpr auto K1 = Number<8>{}; #elif 0 
constexpr auto N = Number<1>{}; constexpr auto Hi = Number<1080>{}; diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp index 0144081160..61b9303c40 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[g, m, n] = a[g, m, k] * b[g, n, k] // d0[g, m] = reduce0(c[g, m, n]) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp index 873bd1c847..e8c3ca2c2a 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = 
ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[g, m, n] = a[g, m, k] * b[g, n, k] // d0[g, m] = reduce0(c[g, m, n]) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp index ec94ed2ace..1216dbf73c 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[g, m, n] = a[g, m, k] * b[g, n, k] // d0[g, m] = reduce0(c[g, m, n]) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp index ad7e70b31b..83921ce728 100644 --- 
a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[g, m, n] = a[g, m, k] * b[g, n, k] // d0[g, m] = reduce0(c[g, m, n]) diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp index 2fcb64a5a7..9288e40e56 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + 
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp index 11301ee8e6..669dca617a 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp index 8702d18596..0abd47142b 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; //------------------------------------------------------------------------------ // Conv1D diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp index eeabd00875..53e0f77550 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - 
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances = diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 3d7e3d3b4b..b5814aa17f 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 556be415f1..53498aff34 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 215156398b..fbe279e033 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -16,10 +16,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] 
= out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 38f79bf937..7fd51bbfbf 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp index 1e93de9cbb..b2f6f9335e 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp @@ -18,16 +18,16 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; 
static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; static constexpr auto ConvFwdOddC = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::OddC; + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; // arbitrary conv using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 50ce68fd71..47405ea1bf 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - 
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index beaad1d3b4..a4060f8bf2 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 402d65a6e0..3c46c2f7e9 100644 --- 
a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 90e0320cff..0db59ca394 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - 
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp index 35a88ac5f1..9c3f0a4b96 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp @@ -18,19 +18,19 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using AddRelu = ck::tensor_operation::element_wise::AddRelu; -static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set; +static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + 
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; static constexpr auto ConvFwdOddC = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::OddC; + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; // arbitrary conv using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp index 00f270a8d3..b9f46e2611 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -19,16 +19,16 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; static constexpr auto ConvFwdOddC = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::OddC; + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; // arbitrary conv using 
device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp index 1c9a4b989c..c56ad270aa 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -18,10 +18,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using AddRelu = ck::tensor_operation::element_wise::AddRelu; -static constexpr auto InMemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd; +static constexpr auto InMemoryAtomicAdd = ck::InMemoryDataOperationEnum::AtomicAdd; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; using device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances = std::tuple< // clang-format off diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index 5f1ec52069..745d26904a 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; 
using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 406c56d2b4..4d51180e72 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -18,13 +18,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + 
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index 2bf65ba078..9a8ff8d714 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index ea0259a3f1..7f54b66f9b 100644 --- 
a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -17,13 +17,13 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto ConvFwd1x1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; static constexpr auto ConvFwd1x1S1P0 = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp index 30dba23903..5c915dcc42 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -9,7 +9,7 @@ namespace tensor_operation { namespace device { namespace device_conv2d_bwd_data_instance { -using BF16 = ushort; +using BF16 = bhalf_t; using F32 = float; template @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - 
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp index cc37fe4599..e8f7d4f11a 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp 
b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp index 5444e5f727..b4c65ab66a 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -16,10 +16,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp index 91fd4c075c..e3958ef689 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - 
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index d563150567..2e4cd5cf31 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -9,7 +9,7 @@ namespace tensor_operation { namespace device { namespace device_conv2d_bwd_data_instance { -using BF16 = ushort; +using BF16 = bhalf_t; using F32 = float; template @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp 
b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index bacdbbfa44..7170decc43 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 1b5c64e2fd..5a727b1113 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -16,10 +16,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto 
ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 776f96c601..3c53644ddc 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index 5083e3c030..edbb7a14d9 100644 --- 
a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -9,7 +9,7 @@ namespace tensor_operation { namespace device { namespace device_conv2d_bwd_data_instance { -using BF16 = ushort; +using BF16 = bhalf_t; using F32 = float; template @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 8d9a7aa2d3..5d00fa8f08 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + 
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index f39318c0e6..d5cd04de6b 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -16,10 +16,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances = diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index 
139141ee7d..d551970606 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -17,10 +17,10 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvBwdDataDefault = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = - ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0; + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp index 0267618448..08047c7e52 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] using device_gemm_xdl_f16_f16_f16_km_kn_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp index a076821b9d..05cb080cbf 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[n, k] = c[m, n] using device_gemm_xdl_f16_f16_f16_km_nk_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp index 0077f21260..4de989caf0 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp index cee8a23fa7..633e2aac2e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -20,8 +20,8 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; -static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // Compilation parameters for a[m, k] * b[n, k] = c[m, n] using device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp index 713ea368a4..8284311102 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] using device_gemm_xdl_f32_f32_f32_km_kn_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp index ce5dc4dda6..235c4771f9 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[n, k] = c[m, n] using device_gemm_xdl_f32_f32_f32_km_nk_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp index f77870e28d..b7000bddf8 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp index 8eae06dbf4..1b4f23141b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = 
ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[n, k] = c[m, n] using device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances = diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp index 7103da5324..26ec965bb5 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp index fb41ab56d9..45e3f9f940 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for 
a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp index 67928073cd..042ac2b8ca 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp index 346b1a4bec..21fdb7cd9d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp index a3ce0cdca0..971bdcad58 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp index 2795acbdfd..3b7bdb87be 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[n, k] = c[m, n] using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp index 3527f36222..8366616246 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp index 715ba3e0bd..396de62cfb 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[n, k] = c[m, n] using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp index 
fe4aaef943..4cd08994b3 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[m, n] = a[k, m] * b[k, n] // d0[m] = reduce0(c[m, n]) diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp index 4ffdf84f8b..4e58b149fa 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[m, n] = a[k, m] * b[n, k] // d0[m] = reduce0(c[m, n]) diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp index 3c9aad584b..64933bd129 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[m, n] = a[m, k] * b[n, k] // d0[m] = reduce0(c[m, n]) diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp index 7de3c627df..fa9de81f85 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ReduceSum = ck::tensor_operation::element_wise::ReduceSum; using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // c[m, n] = a[m, k] * b[n, k] // d0[m] = reduce0(c[m, n]) diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp index 20caafa7de..19f1011c3f 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp index 20c970cebe..59e0d24055 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[k, m] * b[n, k] = c[m, n] using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances = std::tuple< diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp index b16d2b84c9..35052ae8a9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -20,7 +20,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // Compilation parameters for a[m, k] * b[k, n] = c[m, n] using device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances = std::tuple< diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp index 5a6f64b9da..cb41d2724c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -20,8 +20,8 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default; -static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization_t::MNPadding; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // Compilation parameters for a[m, 
k] * b[n, k] = c[m, n] using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = std::tuple< diff --git a/profiler/README.md b/profiler/README.md new file mode 100644 index 0000000000..bfd6a3a53b --- /dev/null +++ b/profiler/README.md @@ -0,0 +1,48 @@ +## Profile GEMM kernels +```bash +#arg1: tensor operation (gemm=GEMM) +#arg2: data type (0=fp32, 1=fp16) +#arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT) +#arg4: verification (0=no, 1=yes) +#arg5: initialization (0=no init, 1=integer value, 2=decimal value) +#arg6: print matrix value (0=no, 1=yes) +#arg7: run kernel # of times (>1) +#arg8 to 13: M, N, K, StrideA, StrideB, StrideC + +################ op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC +./bin/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +```bash +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +.... 
+Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s +``` + +## Profile 2d forward convolution kernels +```bash +#arg1: tensor operation (conv=Convolution) +#arg2: data type (0=fp32, 1=fp16) +#arg3: input tensor layout (0=NCHW, 1=NHWC) +#arg4: weight tensor layout (0=KCYX, 1=KYXC) +#arg5: output tensor layout (0=NKHW, 1=NHWK) +#arg6: verification (0=no, 1=yes) +#arg7: initialization (0=no init, 1=integer value, 2=decimal value) +#arg8: print matrix value (0=no, 1=yes) +#arg9: run kernel # of times (>1) +#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx + ################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + ./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +``` +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} +out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} +.... 
+Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s +``` diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index c71d2cc907..8c15c13b26 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -12,7 +12,7 @@ using F16 = ck::half_t; using F32 = float; -using BF16 = ushort; +using BF16 = ck::bhalf_t; using INT8 = int8_t; namespace ck { namespace tensor_operation { diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 54068e234e..e5c7b5e656 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -64,9 +64,9 @@ template bool description_match(const DescriptionType& description, int Rank, const std::vector& reduceDims, - ReduceTensorOp_t ReduceOpId, - NanPropagation_t NanOpt, - ReduceTensorIndices_t IndicesOpt) + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt) { if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || description.NanOpt_ != static_cast(NanOpt) || @@ -148,9 +148,9 @@ template + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt> void profile_reduce_impl_impl(bool do_verification, int init_method, bool do_log, @@ -166,17 +166,17 @@ void profile_reduce_impl_impl(bool do_verification, using namespace ck::host_reduce; constexpr bool op_support_indices = - (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || - ReduceOpId == ReduceTensorOp_t::AMAX); + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); constexpr bool NeedIndices = - (op_support_indices && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES)); + (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES)); - constexpr bool PropagateNan = (NanOpt == 
NanPropagation_t::PROPAGATE_NAN); + constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN); constexpr bool out_support_atomic_add = std::is_same::value; constexpr bool op_support_atomic_add = - !op_support_indices && ReduceOpId != ReduceTensorOp_t::NORM2; + !op_support_indices && ReduceOpId != ReduceTensorOp::NORM2; constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); // 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations @@ -194,7 +194,7 @@ void profile_reduce_impl_impl(bool do_verification, // 1) The indices can only be used when the reduction operation is indexable constexpr bool invalid_reduce_3 = - (!op_support_indices && IndicesOpt != ReduceTensorIndices_t::NO_INDICES); + (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES); // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction @@ -207,8 +207,8 @@ void profile_reduce_impl_impl(bool do_verification, // 1) If InDataType is int8_t, the supported operation must be either indexable operations or // ADD/AVG constexpr bool invalid_reduce_5 = std::is_same::value && - (!op_support_indices && ReduceOpId != ReduceTensorOp_t::ADD && - ReduceOpId != ReduceTensorOp_t::AVG); + (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD && + ReduceOpId != ReduceTensorOp::AVG); // 1) If InDataType is bhalf_t, must use float as AccDataType for all reduction operations constexpr bool invalid_reduce_6 = @@ -631,9 +631,9 @@ void profile_reduce_impl(bool do_verification, int nrepeat, const std::vector& inLengths, const std::vector& reduceDims, - ReduceTensorOp_t ReduceOpId, - NanPropagation_t NanOpt, - ReduceTensorIndices_t IndicesOpt, + ReduceTensorOp ReduceOpId, + NanPropagation NanOpt, + ReduceTensorIndices IndicesOpt, float alpha, float beta) { @@ -659,9 +659,9 @@ void 
profile_reduce_impl(bool do_verification, OutDataType, descType::Rank_, descType::NumReduceDim_, - static_cast(descType::ReduceOpId_), - static_cast(descType::NanOpt_), - static_cast(descType::IndicesOpt_)>( + static_cast(descType::ReduceOpId_), + static_cast(descType::NanOpt_), + static_cast(descType::IndicesOpt_)>( do_verification, init_method, do_log, diff --git a/profiler/src/README.md b/profiler/src/README.md deleted file mode 100644 index 55942e4834..0000000000 --- a/profiler/src/README.md +++ /dev/null @@ -1,81 +0,0 @@ -## Docker script -```bash -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash -``` - -## Build ```ckProfiler``` -```bash -mkdir build && cd build -``` - -```bash -# Need to Specify target ID, example below is gfx908 -cmake \ --D BUILD_DEV=OFF \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ -.. -``` - -```bash - make -j ckProfiler -``` - -## Profile GEMM kernels -```bash -#arg1: tensor operation (gemm=GEMM) -#arg2: data type (0=fp32, 1=fp16) -#arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT) -#arg4: verification (0=no, 1=yes) -#arg5: initialization (0=no init, 1=integer value, 2=decimal value) -#arg6: print matrix value (0=no, 1=yes) -#arg7: run kernel # of times (>1) -#arg8 to 13: M, N, K, StrideA, StrideB, StrideC - -##################### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC -./profiler/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096 -``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -```bash -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -.... 
-Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s -``` - -## Profile forward convolution kernels -```bash -#arg1: tensor operation (conv=Convolution) -#arg2: data type (0=fp32, 1=fp16) -#arg3: input tensor layout (0=NCHW, 1=NHWC) -#arg4: weight tensor layout (0=KCYX, 1=KYXC) -#arg5: output tensor layout (0=NKHW, 1=NHWK) -#arg6: verification (0=no, 1=yes) -#arg7: initialization (0=no init, 1=integer value, 2=decimal value) -#arg8: print matrix value (0=no, 1=yes) -#arg9: run kernel # of times (>1) -#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx - ##################### op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads - ./profiler/ckProfiler conv_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 -``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} -wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} -out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -.... 
-Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s -``` diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp index 61f22ba003..38c3f52193 100644 --- a/profiler/src/profile_batched_gemm_reduce.cpp +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -9,7 +9,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) { - enum struct GemmMatrixLayout_t + enum struct GemmMatrixLayout { MK_KN_MN, // 0 MK_NK_MN, // 1 @@ -17,7 +17,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) KM_NK_MN, // 3 }; - enum struct GemmReduceDataType_t + enum struct GemmReduceDataType { F32_F32_F32_F32_F32, // 0 F16_F16_F16_F32_F32, // 1 @@ -40,8 +40,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) exit(1); } - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); @@ -57,8 +57,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) const int BatchCount = std::stoi(argv[14]); - if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 && - layout == GemmMatrixLayout_t::MK_KN_MN) + if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_batched_gemm_reduce_impl(std::stoi(argv[2])); - const int in_layout = static_cast(std::stoi(argv[3])); - const int wei_layout = static_cast(std::stoi(argv[4])); - const int out_layout = static_cast(std::stoi(argv[5])); + const auto data_type = static_cast(std::stoi(argv[2])); + const auto in_layout = static_cast(std::stoi(argv[3])); + const auto wei_layout = static_cast(std::stoi(argv[4])); + const auto out_layout = static_cast(std::stoi(argv[5])); const bool do_verification = std::stoi(argv[6]); const int init_method 
= std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp index 2149f3ce47..a83d4ce9a1 100644 --- a/profiler/src/profile_gemm_reduce.cpp +++ b/profiler/src/profile_gemm_reduce.cpp @@ -8,7 +8,7 @@ int profile_gemm_reduce(int argc, char* argv[]) { - enum struct GemmMatrixLayout_t + enum struct GemmMatrixLayout { MK_KN_MN, // 0 MK_NK_MN, // 1 @@ -16,7 +16,7 @@ int profile_gemm_reduce(int argc, char* argv[]) KM_NK_MN, // 3 }; - enum struct GemmReduceDataType_t + enum struct GemmReduceDataType { F32_F32_F32_F32_F32, // 0 F16_F16_F16_F32_F32, // 1 @@ -39,8 +39,8 @@ int profile_gemm_reduce(int argc, char* argv[]) exit(1); } - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); @@ -54,8 +54,7 @@ int profile_gemm_reduce(int argc, char* argv[]) const int StrideB = std::stoi(argv[12]); const int StrideC = std::stoi(argv[13]); - if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 && - layout == GemmMatrixLayout_t::MK_KN_MN) + if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_gemm_reduce_impl #include "profile_grouped_gemm_impl.hpp" -enum GemmMatrixLayout +enum struct GemmMatrixLayout { MK_KN_MN, // 0 MK_NK_MN, // 1 @@ -18,7 +18,7 @@ enum GemmMatrixLayout KM_NK_NM, // 7 }; -enum GemmDataType +enum struct GemmDataType { F32_F32_F32, // 0 F16_F16_F16, // 1 @@ -61,8 +61,8 @@ int profile_grouped_gemm(int argc, char* argv[]) exit(1); } - const int data_type = static_cast(std::stoi(argv[2])); - const int layout = static_cast(std::stoi(argv[3])); + const auto data_type = static_cast(std::stoi(argv[2])); + const auto 
layout = static_cast(std::stoi(argv[3])); const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index b6a515b61f..c6dea1e385 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -20,9 +20,9 @@ using namespace std; -using ck::NanPropagation_t; -using ck::ReduceTensorIndices_t; -using ck::ReduceTensorOp_t; +using ck::NanPropagation; +using ck::ReduceTensorIndices; +using ck::ReduceTensorOp; static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, {"reduceDims", required_argument, nullptr, 'R'}, @@ -84,7 +84,7 @@ static std::vector getTypeValuesFromString(const char* cstr_values) return (values); } -enum struct appDataType_t +enum struct AppDataType { appHalf = 0, appFloat = 1, @@ -130,18 +130,18 @@ class AppArgs std::vector scales; - ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD; - appDataType_t compTypeId = appDataType_t::appFloat; - appDataType_t outTypeId = appDataType_t::appFloat; + ReduceTensorOp reduceOp = ReduceTensorOp::ADD; + AppDataType compTypeId = AppDataType::appFloat; + AppDataType outTypeId = AppDataType::appFloat; bool compType_assigned = false; bool outType_assigned = false; - NanPropagation_t nanOpt = NanPropagation_t::NOT_PROPAGATE_NAN; - ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES; - bool do_log = false; - bool do_verification = false; - bool do_dumpout = false; + NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; + ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; + bool do_log = false; + bool do_verification = false; + bool do_dumpout = false; int init_method; int nrepeat; @@ -213,33 +213,33 @@ class AppArgs if(!optarg) throw std::runtime_error("Invalid option format!"); - reduceOp = static_cast(std::atoi(optarg)); + reduceOp = static_cast(std::atoi(optarg)); break; case 'C': 
if(!optarg) throw std::runtime_error("Invalid option format!"); - compTypeId = static_cast(std::atoi(optarg)); + compTypeId = static_cast(std::atoi(optarg)); compType_assigned = true; break; case 'W': if(!optarg) throw std::runtime_error("Invalid option format!"); - outTypeId = static_cast(std::atoi(optarg)); + outTypeId = static_cast(std::atoi(optarg)); outType_assigned = true; break; case 'N': if(!optarg) throw std::runtime_error("Invalid option format!"); - nanOpt = static_cast(std::atoi(optarg)); + nanOpt = static_cast(std::atoi(optarg)); break; case 'I': if(!optarg) throw std::runtime_error("Invalid option format!"); - indicesOpt = static_cast(std::atoi(optarg)); + indicesOpt = static_cast(std::atoi(optarg)); break; case 'S': if(!optarg) @@ -303,10 +303,10 @@ class AppArgs scales.push_back(0.0f); }; - if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX || - reduceOp == ReduceTensorOp_t::AMAX) + if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX || + reduceOp == ReduceTensorOp::AMAX) { - if(indicesOpt != ReduceTensorIndices_t::NO_INDICES) + if(indicesOpt != ReduceTensorIndices::NO_INDICES) need_indices = true; // for indexable operations, no need to assign compType and outType, just let them be @@ -333,22 +333,22 @@ int profile_reduce(int argc, char* argv[]) check_reduce_dims(rank, args.reduceDims); - if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1) + if(args.reduceOp == ReduceTensorOp::MUL || args.reduceOp == ReduceTensorOp::NORM1) throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!"); if(args.use_half) { if(!args.compType_assigned) - args.compTypeId = appDataType_t::appHalf; + args.compTypeId = AppDataType::appHalf; if(args.outType_assigned && - (args.outTypeId != appDataType_t::appHalf && args.outTypeId != appDataType_t::appFloat)) - args.outTypeId = appDataType_t::appFloat; + (args.outTypeId != AppDataType::appHalf && args.outTypeId != 
AppDataType::appFloat)) + args.outTypeId = AppDataType::appFloat; if(!args.outType_assigned) - args.outTypeId = appDataType_t::appHalf; + args.outTypeId = AppDataType::appHalf; - if(args.compTypeId == appDataType_t::appHalf) + if(args.compTypeId == AppDataType::appHalf) { profile_reduce_impl(args.do_verification, args.init_method, @@ -363,7 +363,7 @@ int profile_reduce(int argc, char* argv[]) args.scales[0], args.scales[1]); } - else if(args.compTypeId == appDataType_t::appFloat) + else if(args.compTypeId == AppDataType::appFloat) { profile_reduce_impl(args.do_verification, args.init_method, @@ -399,16 +399,16 @@ int profile_reduce(int argc, char* argv[]) else if(args.use_int8) { if(!args.compType_assigned) - args.compTypeId = appDataType_t::appInt8; + args.compTypeId = AppDataType::appInt8; if(args.outType_assigned && - (args.outTypeId != appDataType_t::appInt8 && args.outTypeId != appDataType_t::appInt32)) - args.outTypeId = appDataType_t::appInt32; + (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) + args.outTypeId = AppDataType::appInt32; if(!args.outType_assigned) - args.outTypeId = appDataType_t::appInt8; + args.outTypeId = AppDataType::appInt8; - if(args.compTypeId == appDataType_t::appInt8) + if(args.compTypeId == AppDataType::appInt8) { profile_reduce_impl(args.do_verification, args.init_method, @@ -423,7 +423,7 @@ int profile_reduce(int argc, char* argv[]) args.scales[0], args.scales[1]); } - else if(args.compTypeId == appDataType_t::appInt32) + else if(args.compTypeId == AppDataType::appInt32) { profile_reduce_impl(args.do_verification, args.init_method, @@ -443,12 +443,12 @@ int profile_reduce(int argc, char* argv[]) } else if(args.use_bf16) { - if(args.outType_assigned && (args.outTypeId != appDataType_t::appBFloat16 && - args.outTypeId != appDataType_t::appFloat)) - args.outTypeId = appDataType_t::appFloat; + if(args.outType_assigned && + (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != 
AppDataType::appFloat)) + args.outTypeId = AppDataType::appFloat; if(!args.outType_assigned) - args.outTypeId = appDataType_t::appBFloat16; + args.outTypeId = AppDataType::appBFloat16; profile_reduce_impl(args.do_verification, args.init_method, @@ -465,7 +465,7 @@ int profile_reduce(int argc, char* argv[]) } else { - if(args.compTypeId == appDataType_t::appFloat) + if(args.compTypeId == AppDataType::appFloat) { profile_reduce_impl(args.do_verification, args.init_method, @@ -480,7 +480,7 @@ int profile_reduce(int argc, char* argv[]) args.scales[0], args.scales[1]); } - else if(args.compTypeId == appDataType_t::appDouble) + else if(args.compTypeId == AppDataType::appDouble) { profile_reduce_impl(args.do_verification, args.init_method, diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 24e5ae7e3e..c0909ed5c1 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -85,26 +85,24 @@ int main(int argc, char* argv[]) { return profile_reduce(argc, argv); } - else - { - // clang-format off - printf("arg1: tensor operation (gemm: GEMM\n" - " gemm_bias_2d: GEMM+Bias(2D)\n" - " gemm_bias_relu: GEMM+Bias+ReLU\n" - " gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n" - " gemm_reduce: GEMM+Reduce\n" - " grouped_gemm: Grouped Gemm\n" - " conv_fwd: ForwardConvolution\n" - " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n" - " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n" - " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n" - " conv1d_bwd_data: BackwardConvolution data 1 dim\n" - " conv2d_bwd_data: BackwardConvolution data 2 dim\n" - " conv3d_bwd_data: BackwardConvolution data 3 dim\n" - " grouped_gemm: Grouped Gemm\n" - " reduce: REDUCE\n"); - // clang-format on - return 0; - } + // clang-format off + printf("arg1: tensor operation (gemm: GEMM\n" + " gemm_bias_2d: GEMM+Bias(2D)\n" + " gemm_bias_relu: GEMM+Bias+ReLU\n" + " gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n" + " gemm_reduce: GEMM+Reduce\n" + " grouped_gemm: 
Grouped GEMM\n" + " conv_fwd: ForwardConvolution\n" + " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n" + " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n" + " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n" + " conv1d_bwd_data: BackwardConvolution data 1d\n" + " conv2d_bwd_data: BackwardConvolution data 2d\n" + " conv3d_bwd_data: BackwardConvolution data 3d\n" + " grouped_gemm: Grouped GEMM\n" + " reduce: Reduce\n"); + // clang-format on + + return 0; } diff --git a/script/cmake-rocm.sh b/script/cmake-rocm.sh index 0e8424f940..5ba8820651 100755 --- a/script/cmake-rocm.sh +++ b/script/cmake-rocm.sh @@ -10,9 +10,11 @@ cmake -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D BUILD_DEV=OFF \ -D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only " \ +-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ ${MY_PROJECT_SOURCE} +#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \ +#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ diff --git a/test/include/conv_test_util.hpp b/test/include/conv_test_util.hpp index 2355e4be30..31bde8e99d 100644 --- a/test/include/conv_test_util.hpp +++ b/test/include/conv_test_util.hpp @@ -31,7 +31,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = - ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; + 
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; template using DeviceConvNDFwdInstance = ck::tensor_operation::device:: diff --git a/test/magic_number_division/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp index ec53996349..267882e0cb 100644 --- a/test/magic_number_division/magic_number_division.cpp +++ b/test/magic_number_division/magic_number_division.cpp @@ -5,7 +5,7 @@ #include #include #include "config.hpp" -#include "print.hpp" +#include "magic_division.hpp" #include "device.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index e267dcc433..f031648881 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -51,11 +51,11 @@ struct type_mapping constexpr int Rank = 4; -constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AVG; -constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; -constexpr bool PropagateNan = false; -constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES; -constexpr bool NeedIndices = false; +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; +constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES; +constexpr bool NeedIndices = false; template constexpr int Rank = 4; -constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AMAX; -constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; -constexpr bool PropagateNan = false; -constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::FLATTENED_INDICES; -constexpr bool NeedIndices = true; +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AMAX; +constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices IndicesOpt = 
ReduceTensorIndices::FLATTENED_INDICES; +constexpr bool NeedIndices = true; template