From 291c6fef56e8ed26b676eab30da46867af9c8bcb Mon Sep 17 00:00:00 2001 From: kiefer Date: Mon, 15 Dec 2025 16:27:17 +0000 Subject: [PATCH] Small post-merge fixes --- ...conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp | 4 +- .../grid/gridwise_gemm_wmma_cshuffle_v3.hpp | 6 ++ .../test_grouped_convnd_fwd_scaleadd_ab.cpp | 57 +++++++++++-------- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp index a0a84fa0a0..df128c10b9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -454,6 +454,7 @@ struct DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3 BComputeDataType, false, // PermuteA false, // PermuteB + false, // IsBPreShuffled true>; // ForceThreadTileTransfer // TODO: Previously available template param DoElementwiseBeforeCShuffle! @@ -527,6 +528,7 @@ struct DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3 false, // PermuteB false, // PermuteA + false, // IsBPreShuffled true>; // ForceThreadTileTransfer using GridwiseGemmCTranspose = diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp index e98657f079..79d2b3aefe 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp @@ -865,6 +865,10 @@ struct GridwiseGemm_wmma_cshuffle_v3 const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[I0]); const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[I1]); + // AScale struct (Empty) + using AScale = typename BlockwiseGemmPipe::Empty; + auto a_scale_struct = AScale{}; + // BScale struct (Empty) using BScale = typename BlockwiseGemmPipe::Empty; auto b_scale_struct = BScale{}; @@ -875,6 +879,7 @@ struct GridwiseGemm_wmma_cshuffle_v3 decltype(bs_grid_desc_bk0_n_bk1), decltype(ds_grid_desc_mblock_mperblock_nblock_nperblock), decltype(e_grid_desc_mblock_mperblock_nblock_nperblock), + decltype(a_scale_struct), decltype(b_scale_struct), decltype(epilogue_args), HasMainKBlockLoop, @@ -894,6 +899,7 @@ struct GridwiseGemm_wmma_cshuffle_v3 block_m_id, block_n_id, num_k_block_per_scale, + a_scale_struct, b_scale_struct, epilogue_args); } diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp index e25d1b993b..ab7a28a388 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp @@ -1,3 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include #include #include @@ -89,12 +92,12 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); - Tensor input(in_g_n_c_wis_desc); - Tensor input_bias(in_g_n_c_wis_desc); - Tensor weight(wei_g_k_c_xs_desc); - Tensor weight_bias(wei_g_k_c_xs_desc); - Tensor host_output(out_g_n_k_wos_desc); - Tensor device_output(out_g_n_k_wos_desc); + ck::Tensor input(in_g_n_c_wis_desc); + ck::Tensor input_bias(in_g_n_c_wis_desc); + ck::Tensor weight(wei_g_k_c_xs_desc); + ck::Tensor weight_bias(wei_g_k_c_xs_desc); + ck::Tensor host_output(out_g_n_k_wos_desc); + ck::Tensor device_output(out_g_n_k_wos_desc); std::cout << "input: " << input.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl; @@ -116,11 +119,12 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, weight_bias.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); } - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); - DeviceMem in_bias_device_buf(sizeof(InDataType) * input_bias.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); - DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * weight_bias.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + ck::DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + ck::DeviceMem in_bias_device_buf(sizeof(InDataType) * input_bias.mDesc.GetElementSpaceSize()); + ck::DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + ck::DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * + weight_bias.mDesc.GetElementSpaceSize()); + ck::DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); in_device_buf.ToDevice(input.mData.data()); in_bias_device_buf.ToDevice(input_bias.mData.data()); @@ -130,8 +134,8 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, // Run reference op if(do_verification) { - const std::array, NumAs - 1> elementwise_a_tensors = {input_bias}; - const std::array, NumBs - 1> elementwise_b_tensors = {weight_bias}; + const std::array, NumAs - 1> elementwise_a_tensors = {input_bias}; + const std::array, NumBs - 1> elementwise_b_tensors = {weight_bias}; auto ref_conv = ck::tensor_operation::host::ReferenceConvFwdGetWorkSpaceSize(argument_ptr.get()); - DeviceMem workspace_dev(workspace_sz); + ck::DeviceMem workspace_dev(workspace_sz); op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); if(op_ptr->IsSupportedArgument(argument_ptr.get())) @@ -222,16 +226,21 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, if(do_log) { - LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; - LogRangeAsType(std::cout << "input_bias: ", input_bias.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; - LogRangeAsType(std::cout << "weight_bias: ", weight_bias.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") - << std::endl; + printf("log\n"); + // LogRangeAsType(std::cout << "input : ", input.mData, ",") << + // std::endl; LogRangeAsType(std::cout << "input_bias: ", + // input_bias.mData, ",") + // << std::endl; + // LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << + // std::endl; LogRangeAsType(std::cout << "weight_bias: ", + // weight_bias.mData, ",") + // << std::endl; + // LogRangeAsType(std::cout << "host_output : ", host_output.mData, + // ",") + // << std::endl; + // LogRangeAsType(std::cout << "device_output: ", + // device_output.mData, ",") + // << std::endl; } } }