From 899f325a64f15dee96fa982fac8a39af06f73f66 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Wed, 30 Jul 2025 15:13:07 +0000 Subject: [PATCH] Merge commit 'e8709c24f403173ad21a2da907d1347957e324fb' into develop --- include/ck_tile/core/numeric/pk_fp4.hpp | 8 +- script/install_precommit.sh | 2 +- test/ck_tile/batched_transpose/CMakeLists.txt | 4 +- .../test_batched_transpose.cpp | 263 ++++++++++++++++++ 4 files changed, 269 insertions(+), 8 deletions(-) create mode 100644 test/ck_tile/batched_transpose/test_batched_transpose.cpp diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp index b7dca9dd0a..0dee750b69 100644 --- a/include/ck_tile/core/numeric/pk_fp4.hpp +++ b/include/ck_tile/core/numeric/pk_fp4.hpp @@ -55,8 +55,8 @@ struct pk_float4_e2m1_t CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const; template - CK_TILE_HOST_DEVICE raw_type unpack(number) const; - CK_TILE_HOST_DEVICE static pk_float4_e2m1_t pack(const type x0, const type x1) + CK_TILE_HOST_DEVICE constexpr raw_type unpack(number) const; + CK_TILE_HOST_DEVICE constexpr static pk_float4_e2m1_t pack(const type x0, const type x1) { return (x1 << 4) | (x0 & 0b00001111); } @@ -130,7 +130,7 @@ struct numeric }; template -CK_TILE_HOST_DEVICE pk_fp4_raw_t pk_fp4_t::unpack(number) const +CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t pk_fp4_t::unpack(number) const { static_assert(I < 2, "Index is out of range."); if constexpr(I == 1) @@ -147,7 +147,6 @@ namespace impl { template CK_TILE_DEVICE T _from_f4(pk_fp4_raw_t src, float scale = 1.0f) { - // TODO: check the order if constexpr(std::is_same_v) return fp32x2_t(__builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 0))[0]; else if constexpr(std::is_same_v) @@ -167,7 +166,6 @@ CK_TILE_DEVICE T _from_f4(pk_fp4_raw_t src, float scale = 1.0f) template CK_TILE_DEVICE pk_fp4_raw_t _to_f4(T src, float scale = 1.0f) { - // TODO: check the order union { uint32_t u32; diff --git a/script/install_precommit.sh b/script/install_precommit.sh index 6132f6a287..fd1840290e 100755 --- a/script/install_precommit.sh +++ b/script/install_precommit.sh @@ -15,7 +15,7 @@ source "$(dirname "$0")/../.venv/bin/activate" echo "I: Installing tools required for pre-commit checks..." run_and_check pip install dos2unix -run_and_check pip install clang-format==12.0.1 +run_and_check pip install clang-format==18.1.3 echo "I: Installing pre-commit in virtual environment..." run_and_check pip install pre-commit run_and_check pre-commit install diff --git a/test/ck_tile/batched_transpose/CMakeLists.txt b/test/ck_tile/batched_transpose/CMakeLists.txt index ac8e3dac49..f2ef158a4d 100644 --- a/test/ck_tile/batched_transpose/CMakeLists.txt +++ b/test/ck_tile/batched_transpose/CMakeLists.txt @@ -26,8 +26,8 @@ if(GPU_TARGETS MATCHES "gfx9") add_batched_transpose_test(test_ck_tile_batched_transpose_bf16 batched_transpose_bf16.cpp) add_dependencies(${CUSTOM_TARGET_NAME} test_ck_tile_batched_transpose_bf16) - - + add_gtest_executable(test_batched_transpose test_batched_transpose.cpp) + set_property(TARGET test_batched_transpose PROPERTY CXX_STANDARD 20) else() message(DEBUG "Skipping ck_tile batched_transpose tests for current target") endif() diff --git a/test/ck_tile/batched_transpose/test_batched_transpose.cpp b/test/ck_tile/batched_transpose/test_batched_transpose.cpp new file mode 100644 index 0000000000..85008a51a2 --- /dev/null +++ b/test/ck_tile/batched_transpose/test_batched_transpose.cpp @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" + +#include "ck_tile/ops/batched_transpose.hpp" + +enum class PipelineTag : ck_tile::index_t +{ + Universal, + LDSLoadTranspose, +}; + +template +struct PipelineSelector +{ +}; + +template <> +struct PipelineSelector +{ + template + using Problem = ck_tile::BatchedTransposeProblem; + + using Policy = ck_tile::BatchedTransposePolicy; + + template + using Pipeline = ck_tile::BatchedTransposePipeline; +}; + +template <> +struct PipelineSelector +{ + template + using Problem = + ck_tile::BatchedTransposeLdsProblem; + + using Policy = ck_tile::BatchedTransposeLdsPolicy; + + template + using Pipeline = ck_tile::BatchedTransposeLdsPipeline; +}; + +template +struct PipelineConfig +{ + using DataType = DataType_; + using BlockTile = ck_tile::sequence; + using WarpLayout = ck_tile::sequence; + static constexpr bool kPadM = kPadM_; + static constexpr bool kPadN = kPadN_; + static constexpr PipelineTag kPipelineId = kPipelineId_; + static constexpr ck_tile::index_t kBlockX = kBlockX_; + static constexpr ck_tile::index_t kBlockY = kBlockY_; + static constexpr ck_tile::index_t kNumWarpsX = kNumWarpsX_; + static constexpr ck_tile::index_t kNumWarpsY = kNumWarpsY_; + + using Problem = typename PipelineSelector< + kPipelineId_>::template Problem; + using Pipeline = typename PipelineSelector::template Pipeline; + using Kernel = ck_tile::BatchedTransposeKernel; +}; + +template +class TestCkTileBatchedTranspose // N C H W layout_in==NCHW + : public ::testing::TestWithParam> +{ + protected: + void Run(std::tuple param) + { + using DataType = typename Config::DataType; + const auto [N, C, H, W, nchw2nhwc] = param; + const std::string layout_in = nchw2nhwc ? "NCHW" : "NHWC"; + const std::string layout_out = nchw2nhwc ? "NHWC" : "NCHW"; + const auto X_dim = nchw2nhwc ? std::array{N, C, H, W} : std::array{N, H, W, C}; + const auto X_stride = + nchw2nhwc ? std::array{C * H * W, H * W, W, 1} : std::array{C * H * W, C * W, C, 1}; + ck_tile::HostTensor x_host(X_dim, X_stride); + const auto Y_dim = nchw2nhwc ? std::array{N, H, W, C} : std::array{N, C, H, W}; + const auto Y_stride = + nchw2nhwc ? std::array{C * H * W, C * W, C, 1} : std::array{C * H * W, H * W, W, 1}; + ck_tile::HostTensor y_host(Y_dim, Y_stride); + ck_tile::HostTensor y_ref(Y_dim, Y_stride); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + + ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes()); + x_dev.ToDevice(x_host.data()); + + using Kernel = typename Config::Kernel; + + const ck_tile::index_t height = nchw2nhwc ? C : H * W; + const ck_tile::index_t width = nchw2nhwc ? H * W : C; + + if(height % Config::kBlockX != 0 && !Config::kPadM) + { + GTEST_SKIP_("Input cannot be covered with block tiles and Kernel does not force height " + "padding"); + } + + if(width % Config::kBlockY != 0 && !Config::kPadN) + { + GTEST_SKIP_( + "Input cannot be covered with block tiles and Kernel does not force width padding"); + } + + const auto device_name = ck_tile::get_device_name(); + + if(Config::kPipelineId == PipelineTag::LDSLoadTranspose && + device_name.find("gfx950") == std::string::npos) + { + GTEST_SKIP_( + std::format("LDS Load Transpose cannot be launched with {}", device_name).c_str()); + } + + const auto host_args = ck_tile::BatchedTransposeHostArgs{x_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + N, + height, + width, + height * width, + Config::BlockTile::at(1), + Config::BlockTile::at(0)}; + auto kargs = Kernel::MakeKargs(host_args); + + auto sc = ck_tile::stream_config{}; + const dim3 grid_size = Kernel::GridSize(host_args); + constexpr dim3 block_size = Kernel::BlockSize(); + ck_tile::launch_kernel( + sc, ck_tile::make_kernel(Kernel{}, grid_size, block_size, 0, kargs)); + y_dev.FromDevice(y_host.data()); + ck_tile::reference_batched_transpose(x_host, y_ref, layout_in, layout_out); + + std::ostringstream message; + message << "N=" << N << " C=" << C << " H=" << H << " W=" << W << " layout_in=" << layout_in + << " layout_out=" << layout_out << " device_name=" << device_name; + + bool pass = ck_tile::check_err( + y_ref, y_host, message.str(), /* rtol */ 0, /* atol */ 0, /* allow inf */ false); + + EXPECT_TRUE(pass); + } +}; + +// clang-format off +// the default indent is not sane +static const auto kTestingValues = ::testing::Values( +// N C H W layout_in==NCHW + std::tuple{1, 32, 1, 32, true}, + std::tuple{1, 64, 1, 64, true}, + std::tuple{2, 12, 1, 32, false}, + std::tuple{3, 1334, 1, 37, false}, + std::tuple{4, 27, 1, 32, true}, + std::tuple{5, 1234, 1, 12, true}, + std::tuple{1, 1, 1, 1, true}, + std::tuple{1, 1, 1, 1, false}, + std::tuple{128, 1024, 64, 64, true}, + std::tuple{128, 1024, 64, 64, false}, + std::tuple{16, 64, 32, 128, true}, + std::tuple{16, 64, 128, 32, false}, + std::tuple{1, 2048, 1, 1, true}, + std::tuple{1, 2048, 1, 1, false}, + std::tuple{1, 1, 1024, 1024, true}, + std::tuple{1, 1, 1024, 1024, false}, + std::tuple{8, 16, 8, 16, true}, + std::tuple{8, 16, 8, 16, false}, + std::tuple{1, 64, 1, 1024, true}, + std::tuple{1, 64, 1024, 1, false} +); +// clang-format on + +class CaseHalf : public TestCkTileBatchedTranspose> +{ +}; + +class CaseByte : public TestCkTileBatchedTranspose> +{ +}; + +class CaseWord : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfLoadTranspose : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseByteLoadTranspose : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPad + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadLoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfPadMultiWarp + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadMultiWarpLoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +TEST_P(CaseHalf, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseByte, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseWord, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseByteLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPad, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarp, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarpLoadTranspose, TestCorrectness) { this->Run(GetParam()); } + +// clang-format off +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalf, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseByte, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseWord, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseByteLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPad, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarpLoadTranspose, kTestingValues); +// clang-format on