diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index a2d7fe4aaf..b8ca26193d 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -47,7 +47,7 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS ${CMAKE_CURRENT_LIST_DIR}/generate.py --api bwd --receipt 3 - --optdim 32,64,128,256 + --optdim 32,64,96,128,256 # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd... ) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 4df99a9a10..36482e94c1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -380,6 +380,7 @@ def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize] return [ FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + FmhaBwdDQDKDVTileSize( 32, 128, 96, 32, 96, 32, 32, 96, 96, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), # FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index e7babd2744..5c2a5a4b3d 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -34,15 +34,15 @@ function print_log_header(){ } #run verification tests -example/ck_tile/01_fmha/script/smoke_test_fwd.sh -example/ck_tile/01_fmha/script/smoke_test_bwd.sh +time example/ck_tile/01_fmha/script/smoke_test_fwd.sh +time example/ck_tile/01_fmha/script/smoke_test_bwd.sh #run performance benchmarks export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log" print_log_header $fmha_fwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log +time example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log" print_log_header $fmha_bwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log +time example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 3b59505ff0..ec2a6a0ceb 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) EXE_NAME=tile_example_fmha_bwd EXE="$(find . 
-name $EXE_NAME -type f | head -n 1)" KNAME=1 -GPU_arch=$GPU_arch +GPU_arch=${GPU_arch:-""} if [ -z "$GPU_arch" ] ; then GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}') fi @@ -31,7 +31,17 @@ run_exe() { set -ex } +test_h_s_mask() { + run_exe -b=1 -h=4 -h_k=2 -s=259 $@ + run_exe -b=2 -h=2 -s=516 -s_k=253 $@ + run_exe -b=1 -h=4 -h_k=1 -s=500 -s_k=251 -mask=1 $@ + run_exe -b=1 -h=2 -s=900 -s_k=258 -mask=2 $@ + run_exe -b=2 -h=1 -s=987 -s_k=219 -mask=t:128,30 $@ + run_exe -b=2 -h=3 -h_k=1 -s=244 -s_k=499 -mask=b:4,35 $@ +} + set -x +# main tests for prec in "fp16" "bf16" ; do for perm in 0 1 ; do for hdim in 32 64 128 256 ; do @@ -40,21 +50,21 @@ for bias in "n" "a" ; do for dbias in 0 ; do for p_drop in 0.0 0.2 ; do for deterministic in 0 ; do +test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +done +done +done +done +done +done +done +done -run_exe -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS - -done -done -done -done -done -done -done +# additional cases +for hdim in 72 96 ; do +test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS done set +x diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp index 4f9362beb2..fa914a7119 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp @@ -11,199 +11,14 @@ #include "ck_tile/host.hpp" #include "grouped_convolution_utils.hpp" - -template , - typename DsLayout = ck_tile::tuple<>, - typename CDEElementWise = ck_tile::element_wise::PassThrough> -float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args, - const ck_tile::stream_config& s) -{ - constexpr int kBlockPerCu = 1; - - constexpr ck_tile::index_t M_Tile = 64; - constexpr 
ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; - constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; - constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; - - constexpr ck_tile::index_t VectorSizeA = 1; - constexpr ck_tile::index_t VectorSizeB = 1; - constexpr ck_tile::index_t VectorSizeC = 8; - - // Implicit GEMM Traits - using CodegenShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; - using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using GroupedConvTraitsType = ck_tile::GroupedConvTraits; - using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< - InDataType, - WeiDataType, - AccDataType, - CodegenShape, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - GroupedConvTraitsType::VectorSizeA, - GroupedConvTraitsType::VectorSizeB>; - using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto memory_operation = memory_operation_.value; - - using ConvEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - - using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel; - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(args); - const dim3 blocks = Kernel::BlockSize(); - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << CodegenShape::GetName() << '\n' - << "problem: " << CodegenPipelineProblem::GetName() << '\n' - << "pipeline: " << CodegenPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << '\n' - << "Vector size A: " << CodegenPipeline::GetVectorSizeA() - << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() - << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; - } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; - }; - - if(args.k_batch == 1) - { - return Run(ck_tile::integral_constant{}); - } - else - { - return Run(ck_tile::integral_constant{}); - } -} - +#include "grouped_convolution_backward_data_invoker.hpp" #include "run_grouped_convolution_bwd_data_example.inc" -template -int run_grouped_conv_bwd_data_example_prec_type( - std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) -{ - using NWGC = ck_tile::tensor_layout::convolution::NWGC; - using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; - using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; - - using GKXC = ck_tile::tensor_layout::convolution::GKXC; - using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; - using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; - - using NWGK = ck_tile::tensor_layout::convolution::NWGK; - using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; - using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; - - if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NWGC{}, GKXC{}, NWGK{}); - } - else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); - } - else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") - { - return run_grouped_conv_bwd_data_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); - } - else - { - throw std::runtime_error("Unsupported memory layout!"); - } -} - template int run_grouped_conv_bwd_data_example(int argc, char* argv[]) { + using Invoker = GroupedConvolutionBackwardDataInvoker; + auto [result, arg_parser] = create_args(argc, argv); if(!result) return -1; @@ -215,12 +30,16 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[]) if(data_type == "fp16") { - return run_grouped_conv_bwd_data_example_prec_type( + return run_grouped_conv_bwd_data_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else if(data_type == "bf16") { - return run_grouped_conv_bwd_data_example_prec_type( + return run_grouped_conv_bwd_data_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp new file mode 100644 index 0000000000..1b3d45427d --- /dev/null +++ 
b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +#include "grouped_convolution_utils.hpp" + +struct GroupedConvolutionBackwardDataInvoker +{ + + template , + typename DsLayout = ck_tile::tuple<>, + typename CDEElementWise = ck_tile::element_wise::PassThrough> + static float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args, + const ck_tile::stream_config& s) + { + constexpr int kBlockPerCu = 1; + + constexpr ck_tile::index_t M_Tile = 64; + constexpr ck_tile::index_t N_Tile = 64; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; + + constexpr ck_tile::index_t VectorSizeA = 1; + constexpr ck_tile::index_t VectorSizeB = 1; + constexpr ck_tile::index_t VectorSizeC = 8; + + // Implicit GEMM Traits + using CodegenShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType = ck_tile::GroupedConvTraits; + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType::VectorSizeA, + GroupedConvTraitsType::VectorSizeB>; + using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; + + using ConvEpilogue = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << '\n' + << "Vector size A: " << CodegenPipeline::GetVectorSizeA() + << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() + << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + if(args.k_batch == 1) + { + return Run(ck_tile::integral_constant{}); + } + else + { + return Run(ck_tile::integral_constant{}); + } + } +}; diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp index cebfa90579..4cddbae3ab 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp @@ -11,190 +11,14 @@ #include "ck_tile/host.hpp" #include "grouped_convolution_utils.hpp" - -template , - typename DsLayout = ck_tile::tuple<>, - typename CDEElementWise = ck_tile::element_wise::PassThrough> -float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s) -{ - constexpr int kBlockPerCu = 1; - - constexpr ck_tile::index_t M_Tile = 64; - constexpr ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; - constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; - constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; - - constexpr ck_tile::index_t VectorSizeA = 8; - constexpr ck_tile::index_t VectorSizeB = 8; - constexpr ck_tile::index_t VectorSizeC = 8; - - // Implicit GEMM Traits - using CodegenShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; - using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using GroupedConvTraitsType = ck_tile::GroupedConvTraits; - using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< - InDataType, - WeiDataType, - AccDataType, - CodegenShape, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - GroupedConvTraitsType::VectorSizeA, - GroupedConvTraitsType::VectorSizeB>; - using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto memory_operation = memory_operation_.value; - - using ConvEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - - using Kernel = ck_tile::GroupedConvolutionForwardKernel; - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(kargs); - const dim3 blocks = Kernel::BlockSize(); - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << CodegenShape::GetName() << '\n' - << "problem: " << CodegenPipelineProblem::GetName() << '\n' - << "pipeline: " << CodegenPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << '\n' - << "Vector size A: " << CodegenPipeline::GetVectorSizeA() - << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() - << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; - } - - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; - }; - - return Run(ck_tile::integral_constant{}); -} - +#include "grouped_convolution_forward_invoker.hpp" #include "run_grouped_convolution_fwd_example.inc" -template -int run_grouped_conv_fwd_example_prec_type( - std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) -{ - using NWGC = ck_tile::tensor_layout::convolution::NWGC; - using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; - using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; - - using GKXC = ck_tile::tensor_layout::convolution::GKXC; - using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; - using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; - - using NWGK = ck_tile::tensor_layout::convolution::NWGK; - using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; - using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; - - if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NWGC{}, GKXC{}, NWGK{}); - } - else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); - } - else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC") - { - return run_grouped_conv_fwd_example_with_layouts{}, - GemmWarpConfig, - InPrecType, - WeiPrecType, - OutPrecType>( - argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); - } - else - { - throw std::runtime_error("Unsupported memory layout!"); - } -} - template int run_grouped_conv_fwd_example(int argc, char* argv[]) { + using Invoker = GroupedConvolutionForwardInvoker; + auto [result, arg_parser] = create_args(argc, argv); if(!result) return -1; @@ -206,12 +30,12 @@ int run_grouped_conv_fwd_example(int argc, char* argv[]) if(data_type == "fp16") { - return run_grouped_conv_fwd_example_prec_type( + return run_grouped_conv_fwd_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else if(data_type == "bf16") { - return run_grouped_conv_fwd_example_prec_type( + return run_grouped_conv_fwd_example_prec_type( in_layout, wei_layout, out_layout, argc, argv); } else diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp new file mode 100644 index 0000000000..0b9879d247 --- /dev/null +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+#pragma once + +#include "grouped_convolution_utils.hpp" + +struct GroupedConvolutionForwardInvoker +{ + template , + typename DsLayout = ck_tile::tuple<>, + typename CDEElementWise = ck_tile::element_wise::PassThrough> + static float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, + const ck_tile::stream_config& s) + { + constexpr int kBlockPerCu = 1; + + constexpr ck_tile::index_t M_Tile = 64; + constexpr ck_tile::index_t N_Tile = 64; + constexpr ck_tile::index_t K_Tile = 64; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = GemmWarpConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmWarpConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmWarpConfig::K_Warp_Tile; + + constexpr ck_tile::index_t VectorSizeA = 8; + constexpr ck_tile::index_t VectorSizeB = 8; + constexpr ck_tile::index_t VectorSizeC = 8; + + // Implicit GEMM Traits + using CodegenShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType = ck_tile::GroupedConvTraits; + using CodegenPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType::VectorSizeA, + GroupedConvTraitsType::VectorSizeB>; + using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; + + using ConvEpilogue = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionForwardKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! 
Skipping conv!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z + << "}" << '\n' + << "Vector size A: " << CodegenPipeline::GetVectorSizeA() + << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() + << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + return Run(ck_tile::integral_constant{}); + } +}; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc index 8519daaac2..3d7635bf4f 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc @@ -4,6 +4,7 @@ template ( + float ave_time = Invoker::template grouped_conv_bwd_data( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = args.GetFlops(); @@ -39,6 +40,7 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args, template +int run_grouped_conv_bwd_data_example_prec_type( + std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) +{ + using NWGC = ck_tile::tensor_layout::convolution::NWGC; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKXC = ck_tile::tensor_layout::convolution::GKXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NWGK = ck_tile::tensor_layout::convolution::NWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NWGC{}, GKXC{}, NWGK{}); + } + else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); + } + else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); + } + else + { + throw std::runtime_error("Unsupported memory layout!"); + } +} diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc index c5ae92a0da..beb6005e19 100644 --- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc @@ -4,6 +4,7 @@ template ( + float ave_time = Invoker::template 
grouped_conv_fwd( args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = args.GetFlops(); @@ -39,6 +40,7 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, template +int run_grouped_conv_fwd_example_prec_type( + std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) +{ + using NWGC = ck_tile::tensor_layout::convolution::NWGC; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKXC = ck_tile::tensor_layout::convolution::GKXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NWGK = ck_tile::tensor_layout::convolution::NWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NWGC{}, GKXC{}, NWGK{}); + } + else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); + } + else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") + { + return run_grouped_conv_fwd_example_with_layouts{}, + GemmWarpConfig, + Invoker, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); + } + else + { + throw std::runtime_error("Unsupported memory layout!"); + } +} diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp index ad9e2959f5..5eac387a66 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp @@ -408,8 +408,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -457,8 +456,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy tuple, sequence<2, 0>>, sequence<1, 2>, // N0 K1 sequence<0, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -507,8 +505,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -558,8 +555,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; }
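The last four hunks in `block_fmha_bwd_pipeline_default_policy.hpp` replace a check on the product of the tile distribution's lengths with a direct power-of-two test on `kKPerBlock`, so the non-power-of-two head dimension 96 added earlier in the diff no longer takes the `return dstr;` branch. The sketch below is an editor's illustration of that `if constexpr` dispatch in isolation; `select_tile_distribution` and the returned strings are hypothetical, and the diff does not show what the alternative branch in CK actually constructs.

```cpp
// Minimal sketch (not CK code) of compile-time dispatch on whether kKPerBlock
// is a power of two. Names and return strings are illustrative only.
#include <cstdio>

template <int kKPerBlock>
constexpr bool is_power_of_two()
{
    return kKPerBlock > 0 && (kKPerBlock & (kKPerBlock - 1)) == 0;
}

template <int kKPerBlock>
constexpr const char* select_tile_distribution()
{
    if constexpr(is_power_of_two<kKPerBlock>())
        return "direct distribution (old fast path)";
    else
        return "alternative distribution (branch not shown in the diff)";
}

int main()
{
    // hdim 64 and 128 keep the direct path; the new hdim 96 falls through.
    std::printf("K=64 : %s\n", select_tile_distribution<64>());
    std::printf("K=96 : %s\n", select_tile_distribution<96>());
    std::printf("K=128: %s\n", select_tile_distribution<128>());
    return 0;
}
```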
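The grouped-convolution changes move the kernel-launch code out of the example `.cpp` files into new `*_invoker.hpp` headers, and the shared `.inc` files now reach the launcher through a template parameter (`Invoker::template grouped_conv_fwd<...>(...)`). Below is a minimal sketch of that pattern with hypothetical names rather than CK's; the `template` disambiguator is required because `Invoker` is a dependent type inside the `.inc`'s own function template.

```cpp
// Illustrative invoker pattern: an example TU picks the Invoker struct, and a
// generic helper (standing in for the shared .inc) calls its static member template.
#include <cstdio>

struct ForwardInvoker
{
    template <typename DataType>
    static float run(int problem_size)
    {
        // Stand-in for the real kernel launch; returns a fake elapsed time.
        std::printf("forward, %d elements, %zu-byte type\n", problem_size, sizeof(DataType));
        return 0.5f;
    }
};

// Generic over which invoker does the work, like run_grouped_conv_*_example_prec_type.
template <typename Invoker, typename DataType>
float invoke_example(int problem_size)
{
    return Invoker::template run<DataType>(problem_size);
}

int main()
{
    float t = invoke_example<ForwardInvoker, float>(1024);
    std::printf("time: %.2f ms\n", t);
    return 0;
}
```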