diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfa77b8445..bb80273102 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,6 +31,11 @@ repos: entry: projects/composablekernel/script/check_ascii_only.sh language: script types_or: [c++, inc] + - id: crlf-checker + name: Check for CRLF line endings in C/C++ sources + entry: projects/composablekernel/script/check_no_crlf.sh + language: script + types_or: [c++, inc] - id: remove-exec-bit name: Remove executable bit from non-executable files entry: projects/composablekernel/script/remove_exec_bit.sh diff --git a/Jenkinsfile b/Jenkinsfile index bd6b00360f..3197b222ce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -348,6 +348,24 @@ pipeline { cleanWs() } } + stage('CRLF Check') { + agent{ label rocmnode("nogpu") } + environment{ + setup_args = "NO_CK_BUILD" + execute_cmd = """cd .. && \ + find . -type f \\( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.h.in' -o -name '*.hpp.in' -o -name '*.cpp.in' -o -name '*.inc' -o -name '*.cl' \\) \ + -not -path '*/build/*' -not -path '*/include/rapidjson/*' \ + -print0 | xargs -0 -P 8 -n 64 script/check_no_crlf.sh""" + } + steps{ + deleteDir() + script { + loadCk(); + ck.buildAndTest(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd) + } + cleanWs() + } + } } } stage("Run Downstream Tests") diff --git a/include/ck/utility/data_cache_prefetch.hpp b/include/ck/utility/data_cache_prefetch.hpp index 8eeb726b44..e3e3625381 100644 --- a/include/ck/utility/data_cache_prefetch.hpp +++ b/include/ck/utility/data_cache_prefetch.hpp @@ -1,33 +1,33 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#pragma once - -#include "ck/utility/amd_buffer_coherence.hpp" - -namespace ck { - -template -struct GlobalPrefetchDataOp -{ - // addr needs to point to global memory! - __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const - { -#if defined(__gfx125__) - __builtin_amdgcn_global_prefetch(addr, static_cast(Coherence_)); -#endif - } -}; - -template -struct FlatPrefetchDataOp -{ - __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const - { -#if defined(__gfx125__) - __builtin_amdgcn_flat_prefetch(addr, static_cast(Coherence_)); -#endif - } -}; - -} // namespace ck +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck/utility/amd_buffer_coherence.hpp" + +namespace ck { + +template +struct GlobalPrefetchDataOp +{ + // addr needs to point to global memory! + __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const + { +#if defined(__gfx125__) + __builtin_amdgcn_global_prefetch(addr, static_cast(Coherence_)); +#endif + } +}; + +template +struct FlatPrefetchDataOp +{ + __device__ __forceinline__ void operator()([[maybe_unused]] const void* addr) const + { +#if defined(__gfx125__) + __builtin_amdgcn_flat_prefetch(addr, static_cast(Coherence_)); +#endif + } +}; + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp index d3eef8f907..5941ef3179 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_data_cache_prefetch_instance.cpp @@ -1,80 +1,80 @@ -// Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -// SPDX-License-Identifier: MIT - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/host_utility/device_prop.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using BF16 = ck::bhalf_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -template -using S = ck::Sequence; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; -static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave; -static constexpr auto BlkGemmPipeVer = ck::BlockGemmPipelineVersion::v3; - -// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support -template -using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple< - // clang-format off - //#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| | | Compute | Compute | Permute | Minimum | Use | - //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| PipeScheduler| PipelineVer| TypeA | TypeB | A/B | Occupancy| DataCachePrefetch | - //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | | | | | - //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // 128x128x64 - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, - // 256x128x64 - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 64, 8, 8, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, - // 128x256x64 - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 64, 8, 8, 16, 16, 4, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, - // 256x256x64 - DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 256, 64, 8, 8, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch> - // clang-format on - >; - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances( - std::vector>>& - instances) -{ - if(ck::is_gfx125_supported()) - { - add_device_operation_instances( - instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances{}); - } -} - -void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances( - std::vector>>& - instances) -{ - if(ck::is_gfx125_supported()) - { - add_device_operation_instances( - instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances{}); - } -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/host_utility/device_prop.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto BlkGemmPipeSched = ck::BlockGemmPipelineScheduler::Intrawave; +static constexpr auto BlkGemmPipeVer = ck::BlockGemmPipelineVersion::v3; + +// A[m, k] * B[n, k] = C[m, n] with data cache prefetch support +template +using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances = std::tuple< + // clang-format off + //#########################|ALayout|BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| KPer | AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| | | Compute | Compute | Permute | Minimum | Use | + //#########################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| PipeScheduler| PipelineVer| TypeA | TypeB | A/B | Occupancy| DataCachePrefetch | + //#########################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | | | | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // 128x128x64 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, + // 256x128x64 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 64, 8, 8, 16, 16, 8, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, + // 128x256x64 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 64, 8, 8, 16, 16, 4, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch>, + // 256x256x64 + DeviceGemm_Xdl_CShuffleV3< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 256, 64, 8, 8, 16, 16, 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, 1, 2, S<1, 32, 1, 8>, 8, BlkGemmPipeSched, BlkGemmPipeVer, BF16, BF16, false, 0, UseDataCachePrefetch> + // clang-format on + >; + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_prefetch_instances( + std::vector>>& + instances) +{ + if(ck::is_gfx125_supported()) + { + add_device_operation_instances( + instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances{}); + } +} + +void add_device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_no_prefetch_instances( + std::vector>>& + instances) +{ + if(ck::is_gfx125_supported()) + { + add_device_operation_instances( + instances, device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_v3_instances{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/script/check_no_crlf.sh b/script/check_no_crlf.sh new file mode 100755 index 0000000000..514213fad2 --- /dev/null +++ b/script/check_no_crlf.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# Rejects Windows CRLF line endings (a trailing carriage return) in the +# files passed as arguments. Used both by the local pre-commit hook and +# by the Jenkinsfile "CRLF Check" static-check stage. +# +# Usage: ./check_no_crlf.sh ... + +exit_code=0 + +for file in "$@"; do + [[ -f "$file" ]] || continue + if LC_ALL=C grep -qP '\r$' "$file" 2>/dev/null; then + echo "ERROR: $file contains CRLF (Windows) line endings:" + LC_ALL=C grep -nP '\r$' "$file" | head -20 | sed 's/\r$//' + echo " Fix: convert to LF, e.g. 'sed -i 's/\\r\$//' $file' or 'dos2unix $file'" + exit_code=1 + fi +done + +exit $exit_code diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp index e7ba00e4ef..7375f43f1d 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp @@ -1,31 +1,31 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_quant_common.hpp" - -using GroupSize2D128N = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantPreshuffleQuantTypes = ::testing::Types< - std::tuple, GroupSize1D_128, GroupSize1D_128, ColumnMajor>, - std::tuple, GroupSize1D_128, GroupSize2D128N, ColumnMajor> ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - using BQuantGroupSize = std::tuple_element_t<11, TypeParam>; - if(ck_tile::is_gfx120_supported() && std::is_same_v) - { - GTEST_SKIP() << "temp disable due to random fail on gfx120."; - } - - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantPreshuffleQuantTypes = ::testing::Types< + std::tuple, GroupSize1D_128, GroupSize1D_128, ColumnMajor>, + std::tuple, GroupSize1D_128, GroupSize2D128N, ColumnMajor> +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + using BQuantGroupSize = std::tuple_element_t<11, TypeParam>; + if(ck_tile::is_gfx120_supported() && std::is_same_v) + { + GTEST_SKIP() << "temp disable due to random fail on gfx120."; + } + + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/prefetch_op/prefetch_op.cpp b/test/prefetch_op/prefetch_op.cpp index 66cc6ba9c7..a3bc8db411 100644 --- a/test/prefetch_op/prefetch_op.cpp +++ b/test/prefetch_op/prefetch_op.cpp @@ -1,74 +1,74 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck/ck.hpp" -#include "ck/host_utility/device_prop.hpp" - -#include "prefetch_op_util.hpp" - -template -bool run_test(bool time_kernels) -{ - bool pass = true; - -#if defined(__gfx125__) - const auto coherence = - IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT; - using global_prefetch_op = ck::GlobalPrefetchDataOp; - using flat_prefetch_op = ck::FlatPrefetchDataOp; -#else - using global_prefetch_op = ck::GlobalPrefetchDataOp<>; - using flat_prefetch_op = ck::FlatPrefetchDataOp<>; -#endif - - const auto global_prefetch_kernel = - ck::prefetch_op_util::kernel_with_prefetch; - const auto flat_prefetch_kernel = ck::prefetch_op_util:: - kernel_with_prefetch_and_shared_mem; - - const auto prefetch_kernel_container = - std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel); - - ck::static_for<0, 2, 1>{}([&](auto i) { - std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch"); - - auto kernel = std::get{}>(prefetch_kernel_container); - - pass &= - ck::prefetch_op_util::test_prefetch_impl( - time_kernels, kernel, kernel_name); - }); - - return pass; -} - -int main(int argc, char* argv[]) -{ - if(!ck::is_gfx125_supported()) - { - std::cout << "This feature is not supported by current HW, skipping tests." << std::endl; - return 0; - } - - bool time_kernels = false; - - if(argc == 2) - { - time_kernels = std::stoi(argv[1]); - } - - bool pass = true; - - std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl; - - pass &= run_test(time_kernels); - pass &= run_test(time_kernels); - - std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl; - - pass &= run_test(time_kernels); - pass &= run_test(time_kernels); - - std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck/ck.hpp" +#include "ck/host_utility/device_prop.hpp" + +#include "prefetch_op_util.hpp" + +template +bool run_test(bool time_kernels) +{ + bool pass = true; + +#if defined(__gfx125__) + const auto coherence = + IS_L1_PREFETCH ? ck::AmdBufferCoherenceEnum::CU_RT : ck::AmdBufferCoherenceEnum::SE_RT; + using global_prefetch_op = ck::GlobalPrefetchDataOp; + using flat_prefetch_op = ck::FlatPrefetchDataOp; +#else + using global_prefetch_op = ck::GlobalPrefetchDataOp<>; + using flat_prefetch_op = ck::FlatPrefetchDataOp<>; +#endif + + const auto global_prefetch_kernel = + ck::prefetch_op_util::kernel_with_prefetch; + const auto flat_prefetch_kernel = ck::prefetch_op_util:: + kernel_with_prefetch_and_shared_mem; + + const auto prefetch_kernel_container = + std::make_tuple(global_prefetch_kernel, flat_prefetch_kernel); + + ck::static_for<0, 2, 1>{}([&](auto i) { + std::string kernel_name = (i == 1 ? "flat_prefetch" : "global_prefetch"); + + auto kernel = std::get{}>(prefetch_kernel_container); + + pass &= + ck::prefetch_op_util::test_prefetch_impl( + time_kernels, kernel, kernel_name); + }); + + return pass; +} + +int main(int argc, char* argv[]) +{ + if(!ck::is_gfx125_supported()) + { + std::cout << "This feature is not supported by current HW, skipping tests." << std::endl; + return 0; + } + + bool time_kernels = false; + + if(argc == 2) + { + time_kernels = std::stoi(argv[1]); + } + + bool pass = true; + + std::cout << "=== Testing L2 Global Cache Prefetch ===" << std::endl; + + pass &= run_test(time_kernels); + pass &= run_test(time_kernels); + + std::cout << "=== Testing L1 Global Cache Prefetch ===" << std::endl; + + pass &= run_test(time_kernels); + pass &= run_test(time_kernels); + + std::cout << "TestGlobalPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/prefetch_op/prefetch_op_util.hpp b/test/prefetch_op/prefetch_op_util.hpp index 47ecbc3d36..790eb2ab98 100644 --- a/test/prefetch_op/prefetch_op_util.hpp +++ b/test/prefetch_op/prefetch_op_util.hpp @@ -1,276 +1,276 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck/utility/common_header.hpp" - -#include "ck/ck.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/host_utility/hip_check_error.hpp" -#include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/flush_cache.hpp" - -#include - -#include "ck/utility/data_cache_prefetch.hpp" - -namespace ck { -namespace prefetch_op_util { - -template -struct KernelArgs -{ - const T* p_a_grid; - T* dst; - const T* p_b_grid; - bool enable_prefetch; -}; - -template -__global__ void kernel_with_prefetch(KernelArgs args) -{ - const T* src = args.p_a_grid; - T* dst = args.dst; - const T* scalar_data = args.p_b_grid; - bool enable_prefetch = args.enable_prefetch; - - uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - // Calculate number of 32B cachelines needed to cover num_scalars elements - constexpr index_t cachelineSize = 32; - constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); - constexpr unsigned int cachelinesNeeded = - (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; - - const char* byte_addr = reinterpret_cast(scalar_data); - - // Prefetch all scalar data at once - if(tid < cachelinesNeeded) - { - if(enable_prefetch) - { - // Prefetch the cacheline - PrefetchOp{}(byte_addr + tid * cachelineSize); - } - } - - T sum = 0; - if(tid < NUM_THREADS) - { - sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to - // finish - } - __syncthreads(); // waits on loads from global mem - if(tid < NUM_THREADS) - { - // Access prefetched scalar data - for(uint32_t i = 0; i < NUM_SCALARS; i++) - { - sum += scalar_data[i]; // should be fast due to scalars being preloaded - } - - dst[tid] = sum; - } -} - -template -__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs args) -{ - const T* src = args.p_a_grid; - T* dst = args.dst; - const T* scalar_data = args.p_b_grid; - bool enable_prefetch = args.enable_prefetch; - - __shared__ T sharedMem[32]; - - uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - // Calculate number of 32B cachelines needed to cover num_scalars elements - constexpr index_t cachelineSize = 32; - constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); - constexpr unsigned int cachelinesNeeded = - (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; - - bool use_shared_mem = tid % 2 == 1; - - const void* byte_addr; - if(use_shared_mem) - { - byte_addr = reinterpret_cast(sharedMem); - } - else - { - uintptr_t base = reinterpret_cast(scalar_data); - uintptr_t offset = base + (tid / 2) * cachelineSize; - byte_addr = reinterpret_cast(offset); - } - - // Prefetch all scalar data at once - if(tid < cachelinesNeeded * 2) - { - if(enable_prefetch) - { - // Prefetch the cacheline - PrefetchOp{}(byte_addr); - } - else - { - (void)byte_addr; - } - } - - T sum = 0; - if(tid < NUM_THREADS) - { - sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to - // finish - } - __syncthreads(); // waits on loads from global mem - if(tid < NUM_THREADS) - { - // Access prefetched scalar data - for(uint32_t i = 0; i < NUM_SCALARS; i++) - { - sum += scalar_data[i]; // should be fast due to scalars being preloaded - } - - dst[tid] = sum; - } -} - -template -bool test_prefetch_impl(bool time_kernels, - const PrefetchKernel& prefetch_kernel, - const std::string& kernel_name) -{ - constexpr index_t block_size = 256; - constexpr index_t num_elements = NUM_THREADS; - constexpr index_t num_scalars = NUM_SCALARS; - - // TODO: maybe add more prefetch instructions inside kernel to support more values - assert(NUM_SCALARS / sizeof(T) < (32 * block_size) && - "Too many scalars to prefetch with current implementation!"); - - constexpr index_t grid_size = (num_elements + block_size - 1) / block_size; - - std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl; - std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl; - - // Host data - std::vector h_src(num_elements); - std::vector h_scalar(num_scalars); - std::vector h_dst_with_prefetch_chunks(num_elements); - std::vector h_expected(num_elements); - - // Initialize data - for(index_t i = 0; i < num_elements; i++) - { - h_src[i] = static_cast(i % 100); - } - - T scalar_sum = 0; - for(index_t i = 0; i < num_scalars; i++) - { - h_scalar[i] = static_cast(i + 1); - scalar_sum += h_scalar[i]; - } - - // Expected results - for(index_t i = 0; i < num_elements; i++) - { - h_expected[i] = h_src[i] + scalar_sum; - } - - // Device memory - DeviceMem d_src(sizeof(T) * num_elements); - DeviceMem d_scalar(sizeof(T) * num_scalars); - DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements); - - d_src.ToDevice(h_src.data()); - d_scalar.ToDevice(h_scalar.data()); - - KernelArgs args{static_cast(d_src.GetDeviceBuffer()), - static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), - static_cast(d_scalar.GetDeviceBuffer()), - true}; - if(time_kernels) - { - std::array avg_times_us; - ck::static_for<0, 2, 1>{}([&](auto static_i) { - constexpr bool prefetch_enabled = static_i == 0; - std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl; - - args.enable_prefetch = prefetch_enabled; - - constexpr int num_warmup = 1; - constexpr int num_iterations = 10; - constexpr int rotating_count = num_iterations; - auto size_a_buffer = d_src.GetBufferSize(); - auto size_b_buffer = d_scalar.GetBufferSize(); - - ck::utility::RotatingMemWrapper> rotating_mem( - args, rotating_count, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck::utility::flush_icache(); - // rotating mem - rotating_mem.Next(); - }; - float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess( - StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count}, - run_flush_cache, - prefetch_kernel, - dim3(grid_size), - dim3(block_size), - 0, - args); - - float avg_time_us = avg_time_ms * 1000.0f; - float total_bytes = (size_a_buffer + size_b_buffer); // read - float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9; - float ops_per_iteration = num_elements * num_scalars; // adds - float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9; - - std::cout << " Performance: " << std::endl; - std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl; - std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl; - std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl; - - avg_times_us[static_i] = avg_time_us; - }); - - float speedup = avg_times_us[1] / avg_times_us[0]; - - std::cout << "On average kernel with prefetch is " << speedup - << " times faster than without prefetch." << std::endl; - - if(speedup < 1.0f) - std::cout << "WARNING: prefetch kernel is slower!" << std::endl; - } - else - { - launch_and_time_kernel(StreamConfig{nullptr, false}, - prefetch_kernel, - dim3(grid_size), - dim3(block_size), - 0, // lds_byte - args); - } - - // Copy results back - d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data()); - - // Verify results - bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected); - - std::cout << " Correctness: " << (pass ? "PASS" : "FAIL") << std::endl; - std::cout << std::endl; - - return pass; -} - -} // namespace prefetch_op_util -} // namespace ck +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck/utility/common_header.hpp" + +#include "ck/ck.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/flush_cache.hpp" + +#include + +#include "ck/utility/data_cache_prefetch.hpp" + +namespace ck { +namespace prefetch_op_util { + +template +struct KernelArgs +{ + const T* p_a_grid; + T* dst; + const T* p_b_grid; + bool enable_prefetch; +}; + +template +__global__ void kernel_with_prefetch(KernelArgs args) +{ + const T* src = args.p_a_grid; + T* dst = args.dst; + const T* scalar_data = args.p_b_grid; + bool enable_prefetch = args.enable_prefetch; + + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Calculate number of 32B cachelines needed to cover num_scalars elements + constexpr index_t cachelineSize = 32; + constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); + constexpr unsigned int cachelinesNeeded = + (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; + + const char* byte_addr = reinterpret_cast(scalar_data); + + // Prefetch all scalar data at once + if(tid < cachelinesNeeded) + { + if(enable_prefetch) + { + // Prefetch the cacheline + PrefetchOp{}(byte_addr + tid * cachelineSize); + } + } + + T sum = 0; + if(tid < NUM_THREADS) + { + sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to + // finish + } + __syncthreads(); // waits on loads from global mem + if(tid < NUM_THREADS) + { + // Access prefetched scalar data + for(uint32_t i = 0; i < NUM_SCALARS; i++) + { + sum += scalar_data[i]; // should be fast due to scalars being preloaded + } + + dst[tid] = sum; + } +} + +template +__global__ void kernel_with_prefetch_and_shared_mem(KernelArgs args) +{ + const T* src = args.p_a_grid; + T* dst = args.dst; + const T* scalar_data = args.p_b_grid; + bool enable_prefetch = args.enable_prefetch; + + __shared__ T sharedMem[32]; + + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Calculate number of 32B cachelines needed to cover num_scalars elements + constexpr index_t cachelineSize = 32; + constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); + constexpr unsigned int cachelinesNeeded = + (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; + + bool use_shared_mem = tid % 2 == 1; + + const void* byte_addr; + if(use_shared_mem) + { + byte_addr = reinterpret_cast(sharedMem); + } + else + { + uintptr_t base = reinterpret_cast(scalar_data); + uintptr_t offset = base + (tid / 2) * cachelineSize; + byte_addr = reinterpret_cast(offset); + } + + // Prefetch all scalar data at once + if(tid < cachelinesNeeded * 2) + { + if(enable_prefetch) + { + // Prefetch the cacheline + PrefetchOp{}(byte_addr); + } + else + { + (void)byte_addr; + } + } + + T sum = 0; + if(tid < NUM_THREADS) + { + sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to + // finish + } + __syncthreads(); // waits on loads from global mem + if(tid < NUM_THREADS) + { + // Access prefetched scalar data + for(uint32_t i = 0; i < NUM_SCALARS; i++) + { + sum += scalar_data[i]; // should be fast due to scalars being preloaded + } + + dst[tid] = sum; + } +} + +template +bool test_prefetch_impl(bool time_kernels, + const PrefetchKernel& prefetch_kernel, + const std::string& kernel_name) +{ + constexpr index_t block_size = 256; + constexpr index_t num_elements = NUM_THREADS; + constexpr index_t num_scalars = NUM_SCALARS; + + // TODO: maybe add more prefetch instructions inside kernel to support more values + assert(NUM_SCALARS / sizeof(T) < (32 * block_size) && + "Too many scalars to prefetch with current implementation!"); + + constexpr index_t grid_size = (num_elements + block_size - 1) / block_size; + + std::cout << "Testing " << kernel_name << " for type: " << typeid(T).name() << std::endl; + std::cout << "Elements: " << num_elements << ", Scalars: " << num_scalars << std::endl; + + // Host data + std::vector h_src(num_elements); + std::vector h_scalar(num_scalars); + std::vector h_dst_with_prefetch_chunks(num_elements); + std::vector h_expected(num_elements); + + // Initialize data + for(index_t i = 0; i < num_elements; i++) + { + h_src[i] = static_cast(i % 100); + } + + T scalar_sum = 0; + for(index_t i = 0; i < num_scalars; i++) + { + h_scalar[i] = static_cast(i + 1); + scalar_sum += h_scalar[i]; + } + + // Expected results + for(index_t i = 0; i < num_elements; i++) + { + h_expected[i] = h_src[i] + scalar_sum; + } + + // Device memory + DeviceMem d_src(sizeof(T) * num_elements); + DeviceMem d_scalar(sizeof(T) * num_scalars); + DeviceMem d_dst_with_prefetch_chunks(sizeof(T) * num_elements); + + d_src.ToDevice(h_src.data()); + d_scalar.ToDevice(h_scalar.data()); + + KernelArgs args{static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + static_cast(d_scalar.GetDeviceBuffer()), + true}; + if(time_kernels) + { + std::array avg_times_us; + ck::static_for<0, 2, 1>{}([&](auto static_i) { + constexpr bool prefetch_enabled = static_i == 0; + std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl; + + args.enable_prefetch = prefetch_enabled; + + constexpr int num_warmup = 1; + constexpr int num_iterations = 10; + constexpr int rotating_count = num_iterations; + auto size_a_buffer = d_src.GetBufferSize(); + auto size_b_buffer = d_scalar.GetBufferSize(); + + ck::utility::RotatingMemWrapper> rotating_mem( + args, rotating_count, size_a_buffer, size_b_buffer); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck::utility::flush_icache(); + // rotating mem + rotating_mem.Next(); + }; + float avg_time_ms = ck::utility::launch_and_time_kernel_with_preprocess( + StreamConfig{nullptr, true, 0, num_warmup, num_iterations, true, rotating_count}, + run_flush_cache, + prefetch_kernel, + dim3(grid_size), + dim3(block_size), + 0, + args); + + float avg_time_us = avg_time_ms * 1000.0f; + float total_bytes = (size_a_buffer + size_b_buffer); // read + float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9; + float ops_per_iteration = num_elements * num_scalars; // adds + float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9; + + std::cout << " Performance: " << std::endl; + std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl; + std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl; + std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl; + + avg_times_us[static_i] = avg_time_us; + }); + + float speedup = avg_times_us[1] / avg_times_us[0]; + + std::cout << "On average kernel with prefetch is " << speedup + << " times faster than without prefetch." << std::endl; + + if(speedup < 1.0f) + std::cout << "WARNING: prefetch kernel is slower!" << std::endl; + } + else + { + launch_and_time_kernel(StreamConfig{nullptr, false}, + prefetch_kernel, + dim3(grid_size), + dim3(block_size), + 0, // lds_byte + args); + } + + // Copy results back + d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data()); + + // Verify results + bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected); + + std::cout << " Correctness: " << (pass ? "PASS" : "FAIL") << std::endl; + std::cout << std::endl; + + return pass; +} + +} // namespace prefetch_op_util +} // namespace ck diff --git a/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp b/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp index 484ead5c3e..f072d5f8f7 100644 --- a/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp +++ b/test/s_prefetch_inst_op/s_prefetch_inst_op.cpp @@ -1,39 +1,39 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck/ck.hpp" -#include "ck/host_utility/device_prop.hpp" - -#include "s_prefetch_inst_op_util.hpp" - -template -bool run_test(bool time_kernels) -{ - return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl( - time_kernels, "s_prefetch_inst_pc_rel"); -} - -int main(int argc, char* argv[]) -{ - if(!ck::is_gfx12_supported()) - { - std::cout << "instruction cache prefetch is not supported by current HW, skipping tests." - << std::endl; - return 0; - } - - bool time_kernels = false; - if(argc == 2) - { - time_kernels = std::stoi(argv[1]); - } - - bool pass = true; - - std::cout << "=== Testing Instruction Prefetch ===" << std::endl; - - pass &= run_test(time_kernels); - - std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; - return pass ? 0 : 1; -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck/ck.hpp" +#include "ck/host_utility/device_prop.hpp" + +#include "s_prefetch_inst_op_util.hpp" + +template +bool run_test(bool time_kernels) +{ + return ck::s_prefetch_inst_op_util::test_inst_prefetch_impl( + time_kernels, "s_prefetch_inst_pc_rel"); +} + +int main(int argc, char* argv[]) +{ + if(!ck::is_gfx12_supported()) + { + std::cout << "instruction cache prefetch is not supported by current HW, skipping tests." + << std::endl; + return 0; + } + + bool time_kernels = false; + if(argc == 2) + { + time_kernels = std::stoi(argv[1]); + } + + bool pass = true; + + std::cout << "=== Testing Instruction Prefetch ===" << std::endl; + + pass &= run_test(time_kernels); + + std::cout << "TestInstPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +}