From 6935a2481cc5acb196c3db010e2edaa4de717de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 18 Jun 2024 22:05:30 +0200 Subject: [PATCH] Add read_first_lane function for int64 (#1347) [ROCm/composable_kernel commit: 8faec23cb431e38e4d08f6729a9a8f1e136dd7d5] --- ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 9 ++++-- .../device_grouped_conv_bwd_weight_dl.hpp | 9 ++++-- ...onv_bwd_weight_multiple_d_xdl_cshuffle.hpp | 9 ++++-- ...conv_bwd_weight_two_stage_xdl_cshuffle.hpp | 18 +++++++---- ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 9 ++++-- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 17 +++++++---- ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 30 ++++++++++++------- ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp | 6 ++-- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 18 +++++++---- .../ck/utility/amd_wave_read_first_lane.hpp | 24 ++++++++++++++- 10 files changed, 105 insertions(+), 44 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 409e8c7b8b..5e9da459c0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -93,9 +93,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp index 83db2485a1..86091aeba9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp @@ -54,9 +54,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t c_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)); __shared__ FloatAB p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB)]; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp index 380a06e0d8..7f88ea692a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp @@ -66,9 +66,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t c_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)); __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)]; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index 963f3f254c..f4f496fc10 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -59,9 +59,12 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumBatchToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; @@ -113,9 +116,12 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumBatchToMerge); const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); // Pass two lds pointer is the key to tell compiler that ds_read/write // operate on different lds chunk at same time without order dependecy diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp index 3bb53920b2..ce86ec54e5 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -97,9 +97,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t c_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 88fe38adde..f5a8d4e9f7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -106,10 +106,12 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n); - const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx); - const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx)); + const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx); - const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; @@ -170,10 +172,13 @@ __global__ void } else { - const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx)); - const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx); + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); GridwiseGemm::template Run( p_as_grid + a_batch_offset + a_n_offset, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index ba9d967e97..415ae3d496 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -85,12 +85,17 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n); - const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx)); - const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx); - const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx); + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; @@ -142,12 +147,17 @@ __global__ void const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_batch); const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / num_blocks_per_n); - const long_index_t a_batch_offset = compute_ptr_offset_of_groups.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_groups.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_groups.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx)); - const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx); - const long_index_t e_n_offset = compute_ptr_offset_of_n.GetEPtrOffset(n_idx); + const long_index_t a_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)); + const long_index_t e_n_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx)); // Pass two lds pointer is the key to tell compiler that ds_read/write // operate on different lds chunk at same time without order dependecy diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp index 114fcbfcff..2170a5829a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -161,11 +161,11 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + const long_index_t a_batch_offset = amd_wave_read_first_lane( static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); - const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + const long_index_t b_batch_offset = amd_wave_read_first_lane( static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); - const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + const long_index_t e_batch_offset = amd_wave_read_first_lane( static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index dc639e995e..49a6dc3b0f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -60,9 +60,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); @@ -152,9 +155,12 @@ __global__ void __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx); - const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx); - const long_index_t e_batch_offset = compute_ptr_offset_of_batch.GetEPtrOffset(g_idx); + const long_index_t a_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const long_index_t b_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)); + const long_index_t e_batch_offset = + amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)); const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp index 741b2975af..d6e1eab314 100644 --- a/include/ck/utility/amd_wave_read_first_lane.hpp +++ b/include/ck/utility/amd_wave_read_first_lane.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -95,11 +95,33 @@ using get_carrier_t = typename get_carrier::type; } // namespace detail +__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + __device__ inline int32_t amd_wave_read_first_lane(int32_t value) { return __builtin_amdgcn_readfirstlane(value); } +__device__ inline int64_t amd_wave_read_first_lane(int64_t value) +{ + constexpr unsigned object_size = sizeof(int64_t); + constexpr unsigned second_part_offset = object_size / 2; + auto* const from_obj = reinterpret_cast(&value); + alignas(int64_t) std::byte to_obj[object_size]; + + using Sgpr = uint32_t; + + *reinterpret_cast(to_obj) = + amd_wave_read_first_lane(*reinterpret_cast(from_obj)); + *reinterpret_cast(to_obj + second_part_offset) = + amd_wave_read_first_lane(*reinterpret_cast(from_obj + second_part_offset)); + + return *reinterpret_cast(to_obj); +} + template < typename Object, typename = std::enable_if_t && std::is_trivially_copyable_v>>